diff --git a/llvm/projects/CMakeLists.txt b/llvm/projects/CMakeLists.txt index d00a1a056c55..0a52fddeb72a 100644 --- a/llvm/projects/CMakeLists.txt +++ b/llvm/projects/CMakeLists.txt @@ -11,6 +11,7 @@ foreach(entry ${entries}) (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/libunwind) AND (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/test-suite) AND (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/parallel-libs) AND + (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/sycl) AND (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/llvm-spirv) AND (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/openmp) AND (NOT ${entry} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR}/debuginfo-tests)) @@ -43,6 +44,7 @@ endif() add_llvm_external_project(dragonegg) add_llvm_external_project(parallel-libs) add_llvm_external_project(openmp) +add_llvm_external_project(sycl) add_llvm_external_project(llvm-spirv) if(LLVM_INCLUDE_TESTS) diff --git a/sycl/.clang-tidy b/sycl/.clang-tidy new file mode 100644 index 000000000000..0af3553a0cad --- /dev/null +++ b/sycl/.clang-tidy @@ -0,0 +1 @@ +Checks: '-*,clang-analyzer-*,clang-diagnostic-*,cppcoreguidelines-*,-cppcoreguidelines-pro-bounds-array-to-pointer-decay,-cppcoreguidelines-pro-bounds-constant-array-index,-cppcoreguidelines-pro-bounds-pointer-arithmetic,-cppcoreguidelines-pro-type-member-init,google-*,-cppcoreguidelines-pro-type-union-access,-google-build-using-namespace,-google-explicit-constructor,-google-runtime-references,misc-*,-misc-macro-parentheses,-misc-unused-parameters' diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt new file mode 100644 index 000000000000..9e1fe82ebc10 --- /dev/null +++ b/sycl/CMakeLists.txt @@ -0,0 +1,147 @@ +cmake_minimum_required(VERSION 3.2) + +project(sycl-solution) +# Requirements +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +if(MSVC) + set_property(GLOBAL PROPERTY USE_FOLDERS ON) +endif() + +# Get clang's version +include(VersionFromVCS) 
+set(PACKAGE_VERSION "${LLVM_PACKAGE_VERSION}") + +# If CLANG_VERSION_* is specified, use it, if not use LLVM_VERSION_*. +if(NOT DEFINED CLANG_VERSION_MAJOR) + set(CLANG_VERSION_MAJOR ${LLVM_VERSION_MAJOR}) +endif() +if(NOT DEFINED CLANG_VERSION_MINOR) + set(CLANG_VERSION_MINOR ${LLVM_VERSION_MINOR}) +endif() +if(NOT DEFINED CLANG_VERSION_PATCHLEVEL) + set(CLANG_VERSION_PATCHLEVEL ${LLVM_VERSION_PATCH}) +endif() +# Unlike PACKAGE_VERSION, CLANG_VERSION does not include LLVM_VERSION_SUFFIX. +set(CLANG_VERSION "${CLANG_VERSION_MAJOR}.${CLANG_VERSION_MINOR}.${CLANG_VERSION_PATCHLEVEL}") + +set ( LLVM_INST_INC_DIRECTORY "lib${LLVM_LIBDIR_SUFFIX}/clang/${CLANG_VERSION}/include" ) + +find_package(OpenCL REQUIRED) + +include_directories(${OpenCL_INCLUDE_DIRS}) +link_libraries(OpenCL) + +# Copy SYCL headers +set(sycl_inc_dir ${CMAKE_CURRENT_SOURCE_DIR}/include/CL) +set(dst_dir ${LLVM_LIBRARY_OUTPUT_INTDIR}/clang/${CLANG_VERSION}/include/CL) +add_custom_target(sycl-headers ALL +COMMAND ${CMAKE_COMMAND} -E copy_directory ${sycl_inc_dir} ${dst_dir} +COMMENT "Copying SYCL headers ...") + +# Main library + +set(sourceRootPath "${CMAKE_CURRENT_SOURCE_DIR}/source") +set(includeRootPath "${CMAKE_CURRENT_SOURCE_DIR}/include") + +set(SYCLLibrary sycl) + +#To-Do: +#1. Figure out why CMP0057 has to be set. Should have been taken care of earlier in the build +#2. 
Use AddLLVM to modify the build and access config options
+#cmake_policy(SET CMP0057 NEW)
+#include(AddLLVM)
+set(LLVM_BUILD_LIBRARY_DIRS "${LLVM_BINARY_DIR}/lib/")
+
+set(SYCL_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
+set(SYCL_TESTS_BINARY_DIR ${SYCL_BINARY_DIR}/test)
+
+set(CLANG_IN_BUILD "${LLVM_BINARY_DIR}/bin/clang")
+
+set(LLVM_TOOLS_DIR "${LLVM_BINARY_DIR}/bin/")
+
+set(SYCL_INCLUDE "${CMAKE_CURRENT_SOURCE_DIR}/include/")
+set(OPENCL_INCLUDE "${OpenCL_INCLUDE_DIRS}")
+
+add_library("${SYCLLibrary}" SHARED
+  "${includeRootPath}/CL/sycl.hpp"
+  "${sourceRootPath}/detail/common.cpp"
+  "${sourceRootPath}/detail/device_info.cpp"
+  "${sourceRootPath}/detail/event_impl.cpp"
+  "${sourceRootPath}/detail/force_device.cpp"
+  "${sourceRootPath}/detail/helpers.cpp"
+  "${sourceRootPath}/detail/kernel_impl.cpp"
+  "${sourceRootPath}/detail/kernel_info.cpp"
+  "${sourceRootPath}/detail/platform_host.cpp"
+  "${sourceRootPath}/detail/platform_opencl.cpp"
+  "${sourceRootPath}/detail/platform_info.cpp"
+  "${sourceRootPath}/detail/program_impl.cpp"
+  "${sourceRootPath}/detail/program_manager/program_manager.cpp"
+  "${sourceRootPath}/detail/queue_impl.cpp"
+  "${sourceRootPath}/detail/scheduler/commands.cpp"
+  "${sourceRootPath}/detail/scheduler/printers.cpp"
+  "${sourceRootPath}/detail/scheduler/scheduler.cpp"
+  "${sourceRootPath}/context.cpp"
+  "${sourceRootPath}/device.cpp"
+  "${sourceRootPath}/device_selector.cpp"
+  "${sourceRootPath}/event.cpp"
+  "${sourceRootPath}/exception.cpp"
+  "${sourceRootPath}/kernel.cpp"
+  "${sourceRootPath}/platform.cpp"
+  "${sourceRootPath}/queue.cpp"
+  "${sourceRootPath}/spirv_ops.cpp"
+)
+
+target_include_directories("${SYCLLibrary}" PUBLIC "${includeRootPath}")
+
+target_link_libraries("${SYCLLibrary}" "${OpenCL_LIBRARIES}")
+set_target_properties("${SYCLLibrary}" PROPERTIES LINKER_LANGUAGE CXX)
+
+# Workaround for bug in GCC version 5.
+# More information https://bugs.launchpad.net/ubuntu/+source/gcc-5/+bug/1568899
+if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
+    CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0 AND
+    CMAKE_CXX_COMPILER_VERSION VERSION_LESS 6.0)
+  target_link_libraries("${SYCLLibrary}" gcc_s gcc)
+endif()
+
+install(TARGETS "${SYCLLibrary}" DESTINATION "lib" COMPONENT ${SYCLLibrary})
+install(DIRECTORY "${includeRootPath}/." DESTINATION "${LLVM_INST_INC_DIRECTORY}" COMPONENT sycl_headers)
+
+add_subdirectory( test )
+add_subdirectory( tools )
+
+set(manifest_list)
+set( DEPLOY_LIST
+  sycl
+  ocl_lib
+  ocl_headers
+  sycl_headers
+  clang
+  clang-offload-wrapper
+  clang-offload-bundler
+  llc
+  llvm-as
+  llvm-dis
+  llvm-spirv
+  llvm-link
+  opt
+)
+
+foreach( comp ${DEPLOY_LIST} )
+
+  message( STATUS "Adding component ${comp} to deploy")
+
+  set (manifest ${CMAKE_CURRENT_BINARY_DIR}/install_manifest_${comp}.txt)
+  add_custom_command(OUTPUT ${manifest}
+    COMMAND "${CMAKE_COMMAND}"
+    "-DCMAKE_INSTALL_COMPONENT=${comp}"
+    -P "${CMAKE_BINARY_DIR}/cmake_install.cmake"
+    COMMENT "Deploying component ${comp}"
+    USES_TERMINAL VERBATIM)
+  list(APPEND manifest_list ${manifest})
+endforeach()
+
+add_custom_target(deploy DEPENDS ${manifest_list})
diff --git a/sycl/LICENSE.TXT b/sycl/LICENSE.TXT
new file mode 100644
index 000000000000..461398bab7a7
--- /dev/null
+++ b/sycl/LICENSE.TXT
@@ -0,0 +1,68 @@
+==============================================================================
+LLVM Release License
+==============================================================================
+University of Illinois/NCSA
+Open Source License
+
+Copyright (c) 2003-2018 University of Illinois at Urbana-Champaign.
+All rights reserved.
+ +Developed by: + + LLVM Team + + University of Illinois at Urbana-Champaign + + http://llvm.org + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal with +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimers. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimers in the + documentation and/or other materials provided with the distribution. + + * Neither the names of the LLVM Team, University of Illinois at + Urbana-Champaign, nor the names of its contributors may be used to + endorse or promote products derived from this Software without specific + prior written permission. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE +SOFTWARE. + +============================================================================== +Copyrights and Licenses for Third Party Software Distributed with LLVM: +============================================================================== +The LLVM software contains code written by third parties. Such software will +have its own individual LICENSE.TXT file in the directory in which it appears. 
+This file will describe the copyrights, license, and restrictions which apply +to that code. + +The disclaimer of warranty in the University of Illinois Open Source License +applies to all code in the LLVM Distribution, and nothing in any of the +other licenses gives permission to use the names of the LLVM Team or the +University of Illinois to endorse or promote products derived from this +Software. + +The following pieces of software have additional or alternate copyrights, +licenses, and/or restrictions: + +Program Directory +------- --------- +Google Test llvm/utils/unittest/googletest +OpenBSD regex llvm/lib/Support/{reg*, COPYRIGHT.regex} +pyyaml tests llvm/test/YAMLParser/{*.data, LICENSE.TXT} +ARM contributions llvm/lib/Target/ARM/LICENSE.TXT +md5 contributions llvm/lib/Support/MD5.cpp llvm/include/llvm/Support/MD5.h diff --git a/sycl/LICENSE2.TXT b/sycl/LICENSE2.TXT new file mode 100644 index 000000000000..f9dc50615d7e --- /dev/null +++ b/sycl/LICENSE2.TXT @@ -0,0 +1,219 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + +--- LLVM Exceptions to the Apache 2.0 License ---- + +As an exception, if, as a result of your compiling your source code, portions +of this Software are embedded into an Object form of such source code, you +may redistribute such embedded portions in such Object form without complying +with the conditions of Sections 4(a), 4(b) and 4(d) of the License. + +In addition, if you combine or link compiled forms of this Software with +software that is licensed under the GPLv2 ("Combined Software") and if a +court of competent jurisdiction determines that the patent provision (Section +3), the indemnity provision (Section 9) or other Section of the License +conflicts with the conditions of the GPLv2, you may retroactively and +prospectively choose to deem waived or otherwise exclude such Section(s) of +the License, but only in their entirety and only with respect to the Combined +Software. + diff --git a/sycl/doc/GetStartedWithSYCLCompiler.md b/sycl/doc/GetStartedWithSYCLCompiler.md new file mode 100644 index 000000000000..b5ee86c3bd1c --- /dev/null +++ b/sycl/doc/GetStartedWithSYCLCompiler.md @@ -0,0 +1,190 @@ +# Overview + +The SYCL* Compiler compiles C++\-based SYCL source files with code for both CPU and a wide range of compute accelerators. The compiler uses Khronos* OpenCL™ API to offload computations to accelerators. 
+ +# Before You Begin + +Software requirements: + +Installing OpenCL 2.1 compatible software stack: +1. OpenCL headers: + + a. Download the OpenCL headers from [github.com/KhronosGroup/OpenCL-Headers](https://github.com/KhronosGroup/OpenCL-Headers) to your local machine. e.g. `/usr/local/include/CL` with environment var `$OPENCL_HEADERS`. +2. OpenCL runtime for CPU and GPU: + + a. OpenCL runtime for GPU: follow instructions on [github.com/intel/compute-runtime/releases](https://github.com/intel/compute-runtime/releases) to install. + + b. OpenCL runtime for CPU: follow instructions under section "Intel® CPU Runtime for OpenCL. Applications 18.1 for Linux* OS (64bit only)" on [https://software.intel.com/en-us/articles/opencl-drivers#cpu-section](https://software.intel.com/en-us/articles/opencl-drivers#cpu-section) and click on orange "Download" button to download & install. + +# Build the SYCL compiler + +Download the LLVM* repository with SYCL support to your local machine folder e.g. `$HOME/sycl` (assuming environment var `$SYCL_HOME`) folder using following command: + +``` +git clone https://github.com/intel/llvm -b sycl $HOME/sycl +``` + +Follow regular LLVM build instructions under: [llvm.org/docs/CMake.html](https://llvm.org/docs/CMake.html). To build SYCL runtime use modified CMake command below: + +``` +mkdir $SYCL_HOME/build +cd $SYCL_HOME/build +cmake -DCMAKE_BUILD_TYPE=Release -DOpenCL_INCLUDE_DIR=$OPENCL_HEADERS -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_EXTERNAL_PROJECTS="sycl;llvm-spirv" -DLLVM_EXTERNAL_SYCL_SOURCE_DIR=$SYCL_HOME/sycl -DLLVM_EXTERNAL_LLVM_SPIRV_SOURCE_DIR=$SYCL_HOME/llvm-spirv -DLLVM_TOOL_SYCL_BUILD=ON -DLLVM_TOOL_LLVM_SPIRV_BUILD=ON $SYCL_HOME/llvm +make -j`nproc` check-all +``` + +After the build completed, the SYCL compiler/include/libraries can be found under `$SYCL_HOME/build` directory. + +# Creating a simple SYCL program + +A simple SYCL program consists of following parts: +1. Header section +2. Allocating buffer for data +3. 
Creating SYCL queue +4. Submitting command group to SYCL queue which includes the kernel +5. Wait for the queue to complete the work +6. Use buffer accessor to retrieve the result on the device and verify the data +7. The end + +Creating a file `simple-sycl-app.cpp` with the following C++ SYCL code in it: + +``` + +#include + +int main() { + // Creating buffer of 4 ints to be used inside the kernel code + cl::sycl::buffer Buffer(4); + + // Creating SYCL queue + cl::sycl::queue Queue; + + // Size of index space for kernel + cl::sycl::range<1> NumOfWorkItems{Buffer.get_count()}; + + // Submitting command group(work) to queue + Queue.submit([&](cl::sycl::handler &cgh) { + // Getting write only access to the buffer on a device + auto Accessor = Buffer.get_access(cgh); + // Executing kernel + cgh.parallel_for( + NumOfWorkItems, [=](cl::sycl::id<1> WIid) { + // Fill buffer with indexes + Accessor[WIid] = (cl::sycl::cl_int)WIid.get(0); + }); + }); + + // Getting read only access to the buffer on the host. + // Implicit barrier waiting for queue to complete the work. + const auto HostAccessor = Buffer.get_access(); + + // Check the results + bool MismatchFound = false; + for (size_t I = 0; I < Buffer.get_count(); ++I) { + if (HostAccessor[I] != I) { + std::cout << "The result is incorrect for element: " << I + << " , expected: " << I << " , got: " << HostAccessor[I] + << std::endl; + MismatchFound = true; + } + } + + if (!MismatchFound) { + std::cout << "The results are correct!" << std::endl; + } + + return MismatchFound; +} + +``` + +# Build and Test a simple SYCL program +The SYCL Compiler supports two types of compilation: + +1. Simplified one step that compiles to binary directly + + ``` + clang++ -std=c++11 -fsycl simple-sycl-app.cpp -o simple-sycl-app -lsycl -lOpenCL + ``` + +2. Manual two steps compilation that compiles device (to SPIR-V) and host code separately (to binary) + + a. 
Compile the device code from the C++ file into the SPIR-V file: + + ``` + clang++ --sycl -Xclang -fsycl-int-header=simple-sycl-app-int-header.h -c simple-sycl-app.cpp -o kernel.spv + # NOTE: The section "-Xclang -fsycl-int-header=simple-sycl-app-int-header.h" + # generates `integration header` file. + # This file must be included for the host side compilation. + # NOTE: The output file name must be kernel.spv + ``` + + b. Compile host code from the same C++ file into an executable: + + ``` + clang++ -std=c++11 -include simple-sycl-app-int-header.h simple-sycl-app.cpp -o simple-sycl-app -lsycl -lOpenCL + # NOTE: The section "-include simple-sycl-app-int-header.h" includes + # integration header file, which is produced by the device compiler. + ``` + +This `simple-sycl-app` application doesn't specify SYCL device for execution, so SYCL runtime will first try to execute on OpenCL GPU device first, if OpenCL GPU device is not found, it will try to run OpenCL CPU device; and if OpenCL CPU device is also not available, SYCL runtime will run on SYCL host device. + +To run the `simple-sycl-app`: + + LD_LIBRARY_PATH=$SYCL_HOME/build/lib ./simple-sycl-app + The results are correct! + +NOTE: SYCL developer can specify SYCL device for execution using device selectors (e.g. `cl::sycl::cpu_selector`, `cl::sycl::gpu_selector`) as explained in following section [Code the program for a specific GPU](#code-the-program-for-a-specific-gpu). + +# Code the program for a specific GPU + +To specify OpenCL device SYCL provides the abstract `cl::sycl::device_selector` class which the can be used to define how the runtime should select the best device. + +The method `cl::sycl::device_selector::operator()` of the SYCL `cl::sycl::device_selector` is an abstract member function which takes a reference to a SYCL device and returns an integer score. This abstract member function can be implemented in a derived class to provide a logic for selecting a SYCL device. 
SYCL runtime uses the device for with the highest score is returned. Such object can be passed to `cl::sycl::queue` and `cl::sycl::device` constructors. + +The example below illustrates how to use `cl::sycl::device_selector` to create device and queue objects bound to Intel GPU device: + +``` +#include + +int main() { + class NEOGPUDeviceSelector : public cl::sycl::device_selector { + public: + int operator()(const cl::sycl::device &Device) const override { + using namespace cl::sycl::info; + + const std::string DeviceName = Device.get_info(); + const std::string DeviceVendor = Device.get_info(); + + return Device.is_gpu() && DeviceName.find("HD Graphics NEO") ? 1 : -1; + } + }; + + NEOGPUDeviceSelector Selector; + try { + cl::sycl::queue Queue(Selector); + cl::sycl::device Device(Selector); + } catch (cl::sycl::invalid_parameter_error &E) { + std::cout << E.what() << std::endl; + } +} + +``` + + +# Known Issues or Limitations + +- SYCL device compiler fails if the same kernel was used in different translation units. +- SYCL host device is not fully supported. +- SYCL works only with OpenCL implementations supporting out-of-order queues. +- `math.h` header is conflicting with SYCL headers. Please use `cmath` as a workaround for now like below: + +``` +//#include // conflicting +#include +``` + +# Find More + +SYCL 1.2.1 specification: [www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf](https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) + diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp new file mode 100644 index 000000000000..3197620f41e6 --- /dev/null +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -0,0 +1,150 @@ +//==---------- spirv_ops.hpp --- SPIRV operations -------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +namespace cl { +namespace __spirv { + +#ifdef __SYCL_DEVICE_ONLY__ + +template +extern OpTypeEvent * +OpGroupAsyncCopy(int32_t Scope, __local dataT *Dest, __global dataT *Src, + size_t NumElements, size_t Stride, OpTypeEvent *E) noexcept; + +template +extern OpTypeEvent * +OpGroupAsyncCopy(int32_t Scope, __global dataT *Dest, __local dataT *Src, + size_t NumElements, size_t Stride, OpTypeEvent *E) noexcept; + +#define OpGroupAsyncCopyGlobalToLocal OpGroupAsyncCopy +#define OpGroupAsyncCopyLocalToGlobal OpGroupAsyncCopy + +// Atomic SPIR-V builtins +#define __SPIRV_ATOMIC_LOAD(AS, Type) \ + extern Type OpAtomicLoad(AS Type *P, Scope S, MemorySemantics O); +#define __SPIRV_ATOMIC_STORE(AS, Type) \ + extern void OpAtomicStore(AS Type *P, Scope S, MemorySemantics O, Type V); +#define __SPIRV_ATOMIC_EXCHANGE(AS, Type) \ + extern Type OpAtomicExchange(AS Type *P, Scope S, MemorySemantics O, Type V); +#define __SPIRV_ATOMIC_CMP_EXCHANGE(AS, Type) \ + extern Type OpAtomicCompareExchange(AS Type *P, Scope S, MemorySemantics E, \ + MemorySemantics U, Type V, Type C); +#define __SPIRV_ATOMIC_IADD(AS, Type) \ + extern Type OpAtomicIAdd(AS Type *P, Scope S, MemorySemantics O, Type V); +#define __SPIRV_ATOMIC_ISUB(AS, Type) \ + extern Type OpAtomicISub(AS Type *P, Scope S, MemorySemantics O, Type V); +#define __SPIRV_ATOMIC_SMIN(AS, Type) \ + extern Type OpAtomicSMin(AS Type *P, Scope S, MemorySemantics O, Type V); +#define __SPIRV_ATOMIC_UMIN(AS, Type) \ + extern Type OpAtomicUMin(AS Type *P, Scope S, MemorySemantics O, Type V); +#define __SPIRV_ATOMIC_SMAX(AS, Type) \ + extern Type OpAtomicSMax(AS Type *P, Scope S, MemorySemantics O, Type V); +#define __SPIRV_ATOMIC_UMAX(AS, Type) \ + extern Type OpAtomicUMax(AS Type *P, Scope S, MemorySemantics O, Type V); +#define __SPIRV_ATOMIC_AND(AS, Type) \ + extern Type OpAtomicAnd(AS Type *P, 
Scope S, MemorySemantics O, Type V); +#define __SPIRV_ATOMIC_OR(AS, Type) \ + extern Type OpAtomicOr(AS Type *P, Scope S, MemorySemantics O, Type V); +#define __SPIRV_ATOMIC_XOR(AS, Type) \ + extern Type OpAtomicXor(AS Type *P, Scope S, MemorySemantics O, Type V); + +#define __SPIRV_ATOMIC_FLOAT(AS, Type) \ + __SPIRV_ATOMIC_LOAD(AS, Type) \ + __SPIRV_ATOMIC_STORE(AS, Type) \ + __SPIRV_ATOMIC_EXCHANGE(AS, Type) + +#define __SPIRV_ATOMIC_BASE(AS, Type) \ + __SPIRV_ATOMIC_FLOAT(AS, Type) \ + __SPIRV_ATOMIC_CMP_EXCHANGE(AS, Type) \ + __SPIRV_ATOMIC_IADD(AS, Type) \ + __SPIRV_ATOMIC_ISUB(AS, Type) \ + __SPIRV_ATOMIC_AND(AS, Type) \ + __SPIRV_ATOMIC_OR(AS, Type) \ + __SPIRV_ATOMIC_XOR(AS, Type) + +#define __SPIRV_ATOMIC_SIGNED(AS, Type) \ + __SPIRV_ATOMIC_BASE(AS, Type) \ + __SPIRV_ATOMIC_SMIN(AS, Type) \ + __SPIRV_ATOMIC_SMAX(AS, Type) + +#define __SPIRV_ATOMIC_UNSIGNED(AS, Type) \ + __SPIRV_ATOMIC_BASE(AS, Type) \ + __SPIRV_ATOMIC_UMIN(AS, Type) \ + __SPIRV_ATOMIC_UMAX(AS, Type) + +// Helper atomic operations which select correct signed/unsigned version +// of atomic min/max based on the signed-ness of the type +#define __SPIRV_ATOMIC_MINMAX(AS, Op) \ + template \ + typename std::enable_if::value, T>::type OpAtomic##Op( \ + AS T *Ptr, Scope Scope, MemorySemantics Semantics, T Value) { \ + return OpAtomicS##Op(Ptr, Scope, Semantics, Value); \ + } \ + template \ + typename std::enable_if::value, T>::type OpAtomic##Op( \ + AS T *Ptr, Scope Scope, MemorySemantics Semantics, T Value) { \ + return OpAtomicU##Op(Ptr, Scope, Semantics, Value); \ + } + +#define __SPIRV_ATOMICS(macro, Arg) macro(__global, Arg) macro(__local, Arg) + +__SPIRV_ATOMICS(__SPIRV_ATOMIC_FLOAT, float) +__SPIRV_ATOMICS(__SPIRV_ATOMIC_SIGNED, int) +__SPIRV_ATOMICS(__SPIRV_ATOMIC_SIGNED, long) +__SPIRV_ATOMICS(__SPIRV_ATOMIC_SIGNED, long long) +__SPIRV_ATOMICS(__SPIRV_ATOMIC_UNSIGNED, unsigned int) +__SPIRV_ATOMICS(__SPIRV_ATOMIC_UNSIGNED, unsigned long) +__SPIRV_ATOMICS(__SPIRV_ATOMIC_UNSIGNED, unsigned 
long long) +__SPIRV_ATOMICS(__SPIRV_ATOMIC_MINMAX, Min) +__SPIRV_ATOMICS(__SPIRV_ATOMIC_MINMAX, Max) + +#else + +template +extern OpTypeEvent * +OpGroupAsyncCopyGlobalToLocal(int32_t Scope, dataT *Dest, dataT *Src, + size_t NumElements, size_t Stride, + OpTypeEvent *E) noexcept { + for (int i = 0; i < NumElements; i++) { + Dest[i] = Src[i * Stride]; + } + // A real instance of the class is not needed, return dummy pointer. + return nullptr; +} + +template +extern OpTypeEvent * +OpGroupAsyncCopyLocalToGlobal(int32_t Scope, dataT *Dest, dataT *Src, + size_t NumElements, size_t Stride, + OpTypeEvent *E) noexcept { + for (int i = 0; i < NumElements; i++) { + Dest[i * Stride] = Src[i]; + } + // A real instance of the class is not needed, return dummy pointer. + return nullptr; +} + +#endif // __SYCL_DEVICE_ONLY__ + +extern void OpControlBarrier(Scope Execution, Scope Memory, + uint32_t Semantics) noexcept; + +extern void OpMemoryBarrier(Scope Memory, uint32_t Semantics) noexcept; + +extern void OpGroupWaitEvents(int32_t Scope, uint32_t NumEvents, + OpTypeEvent ** WaitEvents) noexcept; + +} // namespace __spirv +} // namespace cl diff --git a/sycl/include/CL/__spirv/spirv_types.hpp b/sycl/include/CL/__spirv/spirv_types.hpp new file mode 100644 index 000000000000..8e2d6bfa357b --- /dev/null +++ b/sycl/include/CL/__spirv/spirv_types.hpp @@ -0,0 +1,48 @@ +//===----------- spirv_types.hpp --- SPIRV types -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +namespace cl { +namespace __spirv { + +// TODO: include the header file with SPIR-V declarations from SPIRV-Headers +// project. 
+enum Scope { + CrossDevice = 0, + Device = 1, + Workgroup = 2, + Subgroup = 3, + Invocation = 4, +}; + +enum MemorySemantics { + None = 0x0, + Acquire = 0x2, + Release = 0x4, + AcquireRelease = 0x8, + SequentiallyConsistent = 0x10, + UniformMemory = 0x40, + SubgroupMemory = 0x80, + WorkgroupMemory = 0x100, + CrossWorkgroupMemory = 0x200, + AtomicCounterMemory = 0x400, + ImageMemory = 0x800, +}; + +// This class does not have definition, it is only predeclared here. +// The pointers to this class objects can be passed to or returned from +// SPIRV built-in functions. +// Only in such cases the class is recognized as SPIRV type OpTypeEvent. +class OpTypeEvent; + +} // namespace __spirv +} // namespace cl diff --git a/sycl/include/CL/sycl.hpp b/sycl/include/CL/sycl.hpp new file mode 100644 index 000000000000..131d1863ba6e --- /dev/null +++ b/sycl/include/CL/sycl.hpp @@ -0,0 +1,47 @@ +//==------------ sycl.hpp - SYCL standard header file ----------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Do not include RT only function implementations for device code as it leads +// to problem. Should be finally fixed when we introduce library. +#ifndef __SYCL_DEVICE_ONLY__ +// The following files are supposed to be included after all SYCL classes +// processed. 
+#include +#include +#include +#endif //__SYCL_DEVICE_ONLY__ diff --git a/sycl/include/CL/sycl/access/access.hpp b/sycl/include/CL/sycl/access/access.hpp new file mode 100644 index 000000000000..2c844e22285a --- /dev/null +++ b/sycl/include/CL/sycl/access/access.hpp @@ -0,0 +1,159 @@ +//==---------------- access.hpp --- SYCL access ----------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +#pragma once + +namespace cl { +namespace sycl { +namespace access { + +enum class target { + global_buffer = 2014, + constant_buffer, + local, + image, + host_buffer, + host_image, + image_array +}; + +enum class mode { + read = 1024, + write, + read_write, + discard_write, + discard_read_write, + atomic +}; + +enum class fence_space { + local_space, + global_space, + global_and_local +}; + +enum class placeholder { false_t, true_t }; + +enum class address_space : int { + private_space = 0, + global_space, + constant_space, + local_space +}; + +} // namespace access + +namespace detail { + +constexpr bool isTargetHostAccess(access::target T) { + return T == access::target::host_buffer || T == access::target::host_image; +} + +constexpr bool modeNeedsOldData(access::mode m) { + return m == access::mode::read || m == access::mode::write || + m == access::mode::read_write || m == access::mode::atomic; +} + +constexpr bool modeWritesNewData(access::mode m) { + return m != access::mode::read; +} + +#ifdef __SYCL_DEVICE_ONLY__ +#define SYCL_GLOBAL_AS __global +#define SYCL_LOCAL_AS __local +#define SYCL_CONSTANT_AS __constant +#define SYCL_PRIVATE_AS __private +#else +#define SYCL_GLOBAL_AS +#define SYCL_LOCAL_AS +#define SYCL_CONSTANT_AS +#define SYCL_PRIVATE_AS +#endif + +template +struct DeviceValueType; + +template +struct DeviceValueType { + using type = 
SYCL_GLOBAL_AS dataT; +}; + +template +struct DeviceValueType { + using type = SYCL_CONSTANT_AS dataT; +}; + +template +struct DeviceValueType { + using type = SYCL_LOCAL_AS dataT; +}; + +template +struct DeviceValueType { + using type = dataT; +}; + +template +struct PtrValueType; + +template +struct PtrValueType { + using type = SYCL_PRIVATE_AS ElementType; +}; + +template +struct PtrValueType { + using type = SYCL_GLOBAL_AS ElementType; +}; + +template +struct PtrValueType { + using type = SYCL_CONSTANT_AS ElementType; +}; + +template +struct PtrValueType { + using type = SYCL_LOCAL_AS ElementType; +}; + +template +struct remove_AS { + typedef T type; +}; + +#ifdef __SYCL_DEVICE_ONLY__ +template +struct remove_AS { + typedef T type; +}; + +template +struct remove_AS { + typedef T type; +}; + +template +struct remove_AS { + typedef T type; +}; + +template +struct remove_AS { + typedef T type; +}; +#endif + +#undef SYCL_GLOBAL_AS +#undef SYCL_LOCAL_AS +#undef SYCL_CONSTANT_AS +#undef SYCL_PRIVATE_AS + +} // namespace detail + +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/accessor.hpp b/sycl/include/CL/sycl/accessor.hpp new file mode 100644 index 000000000000..0f865ccd40aa --- /dev/null +++ b/sycl/include/CL/sycl/accessor.hpp @@ -0,0 +1,848 @@ +//==--------- accessor.hpp --- SYCL accessor -------------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +namespace cl { +namespace sycl { +// TODO: 4.3.2 Implement common reference semantics +namespace detail { + +template +class accessor_base; + +template +class subscript_obj { + using accessor_t = accessor_base; + + // TODO: Remove reference here as subscript_obj, can potentially outlive + // the accessor. There is no spec-defined usecase, so leave it for now. + const accessor_t &accRef; + cl::sycl::id ids; + +public: + subscript_obj(const accessor_t &acc, cl::sycl::id &indexes) + : accRef(acc), ids(indexes) {} + + INLINE_IF_DEVICE subscript_obj + operator[](size_t index) { + ids[accessorDim - dimensions] = index; + return subscript_obj(accRef, ids); + } +}; + +template +class subscript_obj { + using accessor_t = accessor_base; + + const accessor_t &accRef; + cl::sycl::id ids; + +public: + subscript_obj(const accessor_t &acc, cl::sycl::id &indexes) + : accRef(acc), ids(indexes) {} + + INLINE_IF_DEVICE dataT &operator[](size_t index) { + ids[accessorDim - 1] = index; + return accRef.__impl()->Data[getOffsetForId( + accRef.__impl()->Range, ids, accRef.__impl()->Offset)]; + } +}; + +template +class subscript_obj { + using accessor_t = accessor_base; + + const accessor_t &accRef; + cl::sycl::id ids; + +public: + subscript_obj(const accessor_t &acc, cl::sycl::id &indexes) + : accRef(acc), ids(indexes) {} + + INLINE_IF_DEVICE typename detail::remove_AS::type + operator[](size_t index) { + ids[accessorDim - 1] = index; + return accRef.__impl()->Data[getOffsetForId( + accRef.__impl()->Range, ids, accRef.__impl()->Offset)]; + } +}; + +/// Specializations of accessor_impl define data fields for accessor. +/// There is no default implementation for the class. This class is +/// not a root of the class hierarchy, because it should be +/// initialized at the bottom of the hierarchy. 
+template +struct accessor_impl; + +#define SYCL_ACCESSOR_IMPL(CONDITION) \ + template \ + struct accessor_impl::type> + +/// Implementation of host accessor providing access to a single element. +/// Available when (dimensions == 0). +SYCL_ACCESSOR_IMPL(isTargetHostAccess(accessTarget) && dimensions == 0) { + dataT *Data; + accessor_impl(dataT *Data) : Data(Data) {} + + // Returns the number of accessed elements. + INLINE_IF_DEVICE size_t get_count() const { return 1; } +}; + +/// Implementation of host accessor. +/// Available when (dimensions > 0). +SYCL_ACCESSOR_IMPL(isTargetHostAccess(accessTarget) && dimensions > 0) { + dataT *Data; + range Range; + id Offset; + + accessor_impl(dataT *Data, range Range, + id Offset = {}) + : Data(Data), Range(Range), Offset(Offset) {} + + // Returns the number of accessed elements. + INLINE_IF_DEVICE size_t get_count() const { return Range.size(); } +}; + +/// Implementation of device (kernel) accessor providing access to a single +/// element. Available only when (dimensions == 0). +/// There is no way to tell at compile time if this accessor will be used +/// on OpenCL device or on host. So, the class should fit both variants. +SYCL_ACCESSOR_IMPL(!isTargetHostAccess(accessTarget) && + accessTarget != access::target::local && + dimensions == 0) { + // This field must be the first to guarantee that it's safe to use + // reinterpret casting while setting kernel arguments in order to get cl_mem + // value from the buffer regardless of the accessor's dimensionality. +#ifndef __SYCL_DEVICE_ONLY__ + detail::buffer_impl *m_Buf = nullptr; + +#else + char padding[sizeof(detail::buffer_impl *)]; +#endif // __SYCL_DEVICE_ONLY__ + + dataT *Data; + + // Device accessors must be associated with a command group handler. + // The handler though can be nullptr at the creation point if the + // accessor is a placeholder accessor. 
+ accessor_impl(dataT *Data, handler *Handler = nullptr) + : Data(Data) + {} + + // Returns the number of accessed elements. + INLINE_IF_DEVICE size_t get_count() const { return 1; } + + static_assert( + std::is_same::type, + dataT>::value, + "The type should have been adjusted before propagating through " + "class hierarchy"); +}; + +/// Implementation of device (kernel) accessor. There is no way to +/// tell at compile time if this accessor will be used on OpenCL +/// device or on host. So, the class should fit both variants. +/// Available only when (dimensions > 0). +SYCL_ACCESSOR_IMPL(!isTargetHostAccess(accessTarget) && + accessTarget != access::target::local && + dimensions > 0) { + // This field must be the first to guarantee that it's safe to use + // reinterpret casting while setting kernel arguments in order to get cl_mem + // value from the buffer regardless of the accessor's dimensionality. +#ifndef __SYCL_DEVICE_ONLY__ + detail::buffer_impl *m_Buf = nullptr; +#else + char padding[sizeof(detail::buffer_impl *)]; +#endif // __SYCL_DEVICE_ONLY__ + + dataT *Data; + range Range; + id Offset; + + // Device accessors must be associated with a command group handler. + // The handler though can be nullptr at the creation point if the + // accessor is a placeholder accessor. + accessor_impl(dataT *Data, range Range, + handler *Handler = nullptr, id Offset = {}) + : Data(Data), Range(Range), Offset(Offset) + {} + + // Returns the number of accessed elements. + INLINE_IF_DEVICE size_t get_count() const { return Range.size(); } + + static_assert( + std::is_same::type, + dataT>::value, + "The type should have been adjusted before propagating through " + "class hierarchy"); +}; + +/// Implementation of local accessor providing access to a single element. +/// Available only when (dimensions == 0). 
+SYCL_ACCESSOR_IMPL(accessTarget == access::target::local && + dimensions == 0) { + // This field must be the first to guarantee that it's safe to use + // reinterpret casting while setting kernel arguments in order to get size + // value from the accessor regardless of its dimensionality. + size_t ByteSize; + +#ifndef __SYCL_DEVICE_ONLY__ + shared_ptr_class> dataBuf; +#else + char padding[sizeof(shared_ptr_class>)]; +#endif + + dataT *Data; + + accessor_impl(handler * Handler) + : ByteSize(sizeof(dataT)) + { +#ifndef __SYCL_DEVICE_ONLY__ + assert(Handler != nullptr && "Handler is nullptr"); + if (Handler->is_host()) { + dataBuf = std::make_shared>(1); + Data = dataBuf->data(); + } +#endif + } + + // Returns the number of accessed elements. + INLINE_IF_DEVICE size_t get_count() const { return 1; } + + static_assert( + std::is_same::type, + dataT>::value, + "The type should have been adjusted before propagating through " + "class hierarchy"); +}; + +/// Implementation of local accessor. +/// Available only when (dimensions > 0). +SYCL_ACCESSOR_IMPL(accessTarget == access::target::local && + dimensions > 0) { + // This field must be the first to guarantee that it's safe to use + // reinterpret casting while setting kernel arguments in order to get size + // value from the accessor regardless of its dimensionality. + size_t ByteSize; + +#ifndef __SYCL_DEVICE_ONLY__ + shared_ptr_class> dataBuf; +#else + char padding[sizeof(shared_ptr_class>)]; +#endif + + dataT *Data; + range Range; + // TODO delete it when accessor class was remade + // Offset field is not need for local accessor, but this field is now used + // in the inheritance hierarchy. Getting rid of this field will cause + // duplication and complication of the code even more. 
+ id Offset; + + accessor_impl(range Range, handler * Handler) : Range(Range), + ByteSize(Range.size() * sizeof(dataT)) + { +#ifndef __SYCL_DEVICE_ONLY__ + assert(Handler != nullptr && "Handler is nullptr"); + if (Handler->is_host()) { + dataBuf = std::make_shared>(Range.size()); + Data = dataBuf->data(); + } +#endif + } + + // Returns the number of accessed elements. + INLINE_IF_DEVICE size_t get_count() const { return Range.size(); } + + static_assert( + std::is_same::type, + dataT>::value, + "The type should have been adjusted before propagating through " + "class hierarchy"); +}; + +/// Base class for all accessor specializations. +template +class accessor_base { +protected: + template + friend class subscript_obj; + friend class ::cl::sycl::simple_scheduler::Node; + friend class ::cl::sycl::simple_scheduler::Scheduler; + using _ImplT = + accessor_impl; + + INLINE_IF_DEVICE const _ImplT *__impl() const { + return reinterpret_cast(this); + } + + INLINE_IF_DEVICE _ImplT *__impl() { return reinterpret_cast<_ImplT *>(this); } + + static_assert( + std::is_same::type, + dataT>::value, + "The type should have been adjusted before propagating through " + "class hierarchy"); +}; + +// The macro is used to conditionally define methods of accessor class +// by wrapping them into a structure that is non-empty only if the +// condition is met. +#define SYCL_ACCESSOR_SUBCLASS(TAG, PARENT, CONDITION) \ + template \ + struct TAG : ::cl::sycl::detail::PARENT {}; \ + \ + template \ + struct TAG::type> \ + : ::cl::sycl::detail::PARENT + +SYCL_ACCESSOR_SUBCLASS(accessor_common, accessor_base, true /* always */) { + // Returns true if the current accessor is a placeholder accessor. + INLINE_IF_DEVICE constexpr bool is_placeholder() const { + return isPlaceholder == access::placeholder::true_t; + } + + // Returns the size of the accessed memory in bytes. 
+ INLINE_IF_DEVICE size_t get_size() const { return this->get_count() * sizeof(dataT); } + + // Returns the number of accessed elements. + INLINE_IF_DEVICE size_t get_count() const { return this->__impl()->get_count(); } + + template INLINE_IF_DEVICE + typename std::enable_if<(Dimensions > 0), range>::type + get_range() const { return this->__impl()->Range; } + + template INLINE_IF_DEVICE + typename std::enable_if<(Dimensions > 0), id>::type + get_offset() const { return this->__impl()->Offset; } +}; + +SYCL_ACCESSOR_SUBCLASS(accessor_opdata_w, accessor_common, + (accessMode == access::mode::write || + accessMode == access::mode::read_write || + accessMode == access::mode::discard_write || + accessMode == access::mode::discard_read_write) && + dimensions == 0) { + INLINE_IF_DEVICE operator dataT &() const { + return this->__impl()->Data[0]; + } +}; + +SYCL_ACCESSOR_SUBCLASS(accessor_subscript_wn, accessor_opdata_w, + (accessMode == access::mode::write || + accessMode == access::mode::read_write || + accessMode == access::mode::discard_write || + accessMode == access::mode::discard_read_write) && + dimensions > 0) { + dataT &operator[](id index) const { + return this->__impl()->Data[getOffsetForId( + this->get_range(), index, this->get_offset())]; + } + + subscript_obj + INLINE_IF_DEVICE operator[](size_t index) const { + id ids; + ids[0] = index; + return subscript_obj(*this, ids); + } +}; + +SYCL_ACCESSOR_SUBCLASS(accessor_subscript_w, accessor_subscript_wn, + (accessMode == access::mode::write || + accessMode == access::mode::read_write || + accessMode == access::mode::discard_write || + accessMode == access::mode::discard_read_write) && + dimensions == 1) { + // The tricky part here is that there is no function overloading + // between different scopes in C++. That is, operator[] defined in a + // child class hides any operator[] defined in any of the parent + // classes. 
That's why operator[] defined in accessor_subscript_wn + // is not visible here and we have to define + // operator[](id) once again. + INLINE_IF_DEVICE dataT &operator[](id index) const { + return this->operator[]( + getOffsetForId(this->get_range(), index, this->get_offset())); + } + INLINE_IF_DEVICE dataT &operator[](size_t index) const { + return this->__impl()->Data[index]; + } +}; + +SYCL_ACCESSOR_SUBCLASS(accessor_opdata_r, accessor_subscript_w, + accessMode == access::mode::read && dimensions == 0) { + using PureType = typename detail::remove_AS::type; + operator PureType() const { + return this->__impl()->Data[0]; + } +}; + +SYCL_ACCESSOR_SUBCLASS(accessor_subscript_rn, accessor_opdata_r, + accessMode == access::mode::read && dimensions > 0) { + typename detail::remove_AS::type + operator[](id index) const { + return this->__impl()->Data[getOffsetForId( + this->get_range(), index, this->get_offset())]; + } + + subscript_obj + operator[](size_t index) const { + id ids; + ids[0] = index; + return subscript_obj(*this, ids); + } +}; + +SYCL_ACCESSOR_SUBCLASS(accessor_subscript_r, accessor_subscript_rn, + accessMode == access::mode::read && dimensions == 1) { + typename detail::remove_AS::type + operator[](id index) const { + return this->operator[]( + getOffsetForId(this->get_range(), index, this->get_offset())); + } + typename detail::remove_AS::type + operator[](size_t index) const { + return this->__impl()->Data[index]; + } +}; + +template struct getAddressSpace { + constexpr static cl::sycl::access::address_space value = + cl::sycl::access::address_space::global_space; +}; + +template <> struct getAddressSpace { + constexpr static cl::sycl::access::address_space value = + cl::sycl::access::address_space::local_space; +}; + +// Available when: accessMode == access::mode::atomic && dimensions == 0 +SYCL_ACCESSOR_SUBCLASS(accessor_subscript_atomic_eq0, accessor_subscript_r, + accessMode == access::mode::atomic && dimensions == 0) { + using PureType = typename 
detail::remove_AS::type; + constexpr static access::address_space addressSpace = + getAddressSpace::value; + operator atomic() const { + return atomic( + multi_ptr(&(this->__impl()->Data[0]))); + } +}; + +// Available when: accessMode == access::mode::atomic && dimensions > 0 +SYCL_ACCESSOR_SUBCLASS(accessor_subscript_atomic_gt0, + accessor_subscript_atomic_eq0, + accessMode == access::mode::atomic && dimensions > 0) { + using PureType = typename detail::remove_AS::type; + constexpr static access::address_space addressSpace = + getAddressSpace::value; + atomic operator[](id index) const { + return atomic( + multi_ptr(&(this->__impl()->Data[getOffsetForId( + this->__impl()->Range, index, this->__impl()->Offset)]))); + } +}; + +// Available only when: accessMode == access::mode::atomic && dimensions == 1 +SYCL_ACCESSOR_SUBCLASS(accessor_subscript_atomic_eq1, + accessor_subscript_atomic_gt0, + accessMode == access::mode::atomic && dimensions == 1) { + using PureType = typename detail::remove_AS::type; + constexpr static access::address_space addressSpace = + getAddressSpace::value; + atomic operator[](size_t index) const { + return atomic( + multi_ptr(&(this->__impl()->Data[index]))); + } +}; + +// TODO: +// /* Available only when: dimensions > 1 */ +// __unspecified__ &operator[](size_t index) const; + +SYCL_ACCESSOR_SUBCLASS(accessor_pointer, accessor_subscript_atomic_eq1, true) { + /* Available only when: accessTarget == access::target::host_buffer */ + template ::type, + access::target AccessTarget = accessTarget> + typename std::enable_if<(AccessTarget == access::target::host_buffer), + dataT *>::type + get_pointer() const { + return this->__impl()->Data; + } + /* Available only when: accessTarget == access::target::global_buffer */ + template ::type, + access::target AccessTarget = accessTarget> + typename std::enable_if<(AccessTarget == access::target::global_buffer), + global_ptr>::type + get_pointer() const { + return global_ptr(this->__impl()->Data); + } + 
/* Available only when: accessTarget == access::target::constant_buffer */ + template ::type, + access::target AccessTarget = accessTarget> + typename std::enable_if<(AccessTarget == access::target::constant_buffer), + constant_ptr>::type + get_pointer() const { + return constant_ptr(this->__impl()->Data); + } + /* Available only when: accessTarget == access::target::local */ + template ::type, + access::target AccessTarget = accessTarget> + typename std::enable_if<(AccessTarget == access::target::local), + local_ptr>::type + get_pointer() const { + return local_ptr(this->__impl()->Data); + } +}; + +} // namespace detail + +// +// Actual definition of sycl::accessor class. +// +template +class accessor + : public detail::accessor_pointer< + typename detail::DeviceValueType::type, + dimensions, accessMode, accessTarget, isPlaceholder> { + using _ValueType = + typename detail::DeviceValueType::type; + using _ImplT = detail::accessor_impl<_ValueType, dimensions, accessMode, + accessTarget, isPlaceholder>; + + // Make sure Impl field is the first in the class, so that it is + // safe to reinterpret a pointer to accessor as a pointer to the + // implementation. 
+ _ImplT __impl; + + INLINE_IF_DEVICE void __init(_ValueType *Ptr, range Range, + id Offset) { + __impl.Data = Ptr; + __impl.Range = Range; + __impl.Offset = Offset; + } + +public: + using value_type = dataT; + using reference = dataT &; + using const_reference = const dataT &; + + // buffer accessor ctor #1 + // accessor(buffer &); + // + // Available only when: + // ((isPlaceholder == access::placeholder::false_t && + // accessTarget == access::target::host_buffer) || + // (isPlaceholder == access::placeholder::true_t && + // (accessTarget == access::target::global_buffer|| + // accessTarget == access::target::constant_buffer))) && + // dimensions == 0 + template + accessor(typename std::enable_if< + (((IsPlaceholder == access::placeholder::false_t && + AccessTarget == access::target::host_buffer) || + (IsPlaceholder == access::placeholder::true_t && + (AccessTarget == access::target::global_buffer || + AccessTarget == access::target::constant_buffer))) && + Dimensions == 0), + buffer>::type &bufferRef) + : __impl(detail::getSyclObjImpl(bufferRef)->BufPtr) { + auto BufImpl = detail::getSyclObjImpl(bufferRef); + if (AccessTarget == access::target::host_buffer) { + if (BufImpl->OpenCLInterop) { + throw cl::sycl::runtime_error( + "Host access to interoperability buffer is not allowed"); + } else { + simple_scheduler::Scheduler::getInstance() + .copyBack(*BufImpl); + } + } + if (BufImpl->OpenCLInterop && !BufImpl->isValidAccessToMem(accessMode)) { + throw cl::sycl::runtime_error( + "Access mode is incompatible with opencl memory object of the " + "interoperability buffer"); + } + } + + // buffer accessor ctor #2: + // accessor(buffer &, handler &); + // + // Available only when: + // isPlaceholder == access::placeholder::false_t && + // (accessTarget == access::target::global_buffer || + // accessTarget == access::target::constant_buffer) && + // dimensions == 0 + template + accessor(typename std::enable_if< + (IsPlaceholder == access::placeholder::false_t && + 
(AccessTarget == access::target::global_buffer || + AccessTarget == access::target::constant_buffer) && + Dimensions == 0), + buffer>::type &bufferRef, + handler &commandGroupHandlerRef) +#ifdef __SYCL_DEVICE_ONLY__ + ; // This ctor can't be used in device code, so no need to define it. +#else // !__SYCL_DEVICE_ONLY__ + : __impl(detail::getSyclObjImpl(bufferRef)->BufPtr, + detail::getSyclObjImpl(bufferRef)->Range, + &commandGroupHandlerRef) { + auto BufImpl = detail::getSyclObjImpl(bufferRef); + if (BufImpl->OpenCLInterop && !BufImpl->isValidAccessToMem(accessMode)) { + throw cl::sycl::runtime_error( + "Access mode is incompatible with opencl memory object of the " + "interoperability buffer"); + } + commandGroupHandlerRef.AddBufDep(*BufImpl); + __impl.m_Buf = BufImpl.get(); + } +#endif // !__SYCL_DEVICE_ONLY__ + + // buffer accessor ctor #3: + // accessor(buffer &); + // + // Available only when: + // ((isPlaceholder == access::placeholder::false_t && + // accessTarget == access::target::host_buffer) || + // (isPlaceholder == access::placeholder::true_t && + // (accessTarget == access::target::global_buffer || + // accessTarget == access::target::constant_buffer))) && + // dimensions > 0) + template + accessor(typename std::enable_if< + (((IsPlaceholder == access::placeholder::false_t && + AccessTarget == access::target::host_buffer) || + (IsPlaceholder == access::placeholder::true_t && + (AccessTarget == access::target::global_buffer || + AccessTarget == access::target::constant_buffer))) && + Dimensions > 0), + buffer>::type &bufferRef) + : __impl(detail::getSyclObjImpl(bufferRef)->BufPtr, + detail::getSyclObjImpl(bufferRef)->Range) { + auto BufImpl = detail::getSyclObjImpl(bufferRef); + if (AccessTarget == access::target::host_buffer) { + if (BufImpl->OpenCLInterop) { + throw cl::sycl::runtime_error( + "Host access to interoperability buffer is not allowed"); + } else { + simple_scheduler::Scheduler::getInstance() + .copyBack(*BufImpl); + } + } + if 
(BufImpl->OpenCLInterop && !BufImpl->isValidAccessToMem(accessMode)) { + throw cl::sycl::runtime_error( + "Access mode is incompatible with opencl memory object of the " + "interoperability buffer"); + } + } + + // buffer ctor #4: + // accessor(buffer &, handler &); + // + // Available only when: + // isPlaceholder == access::placeholder::false_t && + // (accessTarget == access::target::global_buffer || + // accessTarget == access::target::constant_buffer) && + // dimensions > 0 + template + accessor(typename std::enable_if< + (IsPlaceholder == access::placeholder::false_t && + (AccessTarget == access::target::global_buffer || + AccessTarget == access::target::constant_buffer) && + Dimensions > 0), + buffer>::type &bufferRef, + handler &commandGroupHandlerRef) +#ifdef __SYCL_DEVICE_ONLY__ + ; // This ctor can't be used in device code, so no need to define it. +#else + : __impl(detail::getSyclObjImpl(bufferRef)->BufPtr, + detail::getSyclObjImpl(bufferRef)->Range, + &commandGroupHandlerRef) { + auto BufImpl = detail::getSyclObjImpl(bufferRef); + if (BufImpl->OpenCLInterop && !BufImpl->isValidAccessToMem(accessMode)) { + throw cl::sycl::runtime_error( + "Access mode is incompatible with opencl memory object of the " + "interoperability buffer"); + } + commandGroupHandlerRef.AddBufDep(*BufImpl); + __impl.m_Buf = BufImpl.get(); + } +#endif + + // accessor ctor #5: + // accessor(buffer &, range Range, id Offset = {}); + // + // Available only when: + // (isPlaceholder == access::placeholder::false_t && + // accessTarget == access::target::host_buffer) || + // (isPlaceholder == access::placeholder::true_t && + // (accessTarget == access::target::global_buffer || + // accessTarget == access::target::constant_buffer) && + // dimensions > 0) + template + accessor(typename std::enable_if< + ((IsPlaceholder == access::placeholder::false_t && + AccessTarget == access::target::host_buffer) || + (IsPlaceholder == access::placeholder::true_t && + (AccessTarget == 
access::target::global_buffer || + AccessTarget == access::target::constant_buffer) && + Dimensions > 0)), + buffer>::type &bufferRef, + range Range, + id Offset = {} + ) +#ifdef __SYCL_DEVICE_ONLY__ + ; // This ctor can't be used in device code, so no need to define it. +#else // !__SYCL_DEVICE_ONLY__ + : __impl(detail::getSyclObjImpl(bufferRef)->BufPtr, Range, Offset) { + auto BufImpl = detail::getSyclObjImpl(bufferRef); + if (AccessTarget == access::target::host_buffer) { + if (BufImpl->OpenCLInterop) { + throw cl::sycl::runtime_error( + "Host access to interoperability buffer is not allowed"); + } else { + simple_scheduler::Scheduler::getInstance() + .copyBack(*BufImpl); + } + } + if (BufImpl->OpenCLInterop && !BufImpl->isValidAccessToMem(accessMode)) { + throw cl::sycl::runtime_error( + "Access mode is incompatible with opencl memory object of the " + "interoperability buffer"); + } + } +#endif // !__SYCL_DEVICE_ONLY__ + + // buffer ctor #6: + // accessor(buffer &, handler &, range Range, id Offset); + // + // Available only when: + // isPlaceholder == access::placeholder::false_t && + // (accessTarget == access::target::global_buffer || + // accessTarget == access::target::constant_buffer) && + // dimensions > 0 + template + accessor(typename std::enable_if< + (IsPlaceholder == access::placeholder::false_t && + (AccessTarget == access::target::global_buffer || + AccessTarget == access::target::constant_buffer) && + Dimensions > 0), + buffer>::type &bufferRef, + handler &commandGroupHandlerRef, + range Range, + id Offset = {} + ) +#ifdef __SYCL_DEVICE_ONLY__ + ; // This ctor can't be used in device code, so no need to define it. 
+#else // !__SYCL_DEVICE_ONLY__ + : __impl(detail::getSyclObjImpl(bufferRef)->BufPtr, Range, + &commandGroupHandlerRef, Offset) { + auto BufImpl = detail::getSyclObjImpl(bufferRef); + if (BufImpl->OpenCLInterop && !BufImpl->isValidAccessToMem(accessMode)) { + throw cl::sycl::runtime_error( + "Access mode is incompatible with opencl memory object of the " + "interoperability buffer"); + } + commandGroupHandlerRef.AddBufDep(*BufImpl); + __impl.m_Buf = BufImpl.get(); + } +#endif // !__SYCL_DEVICE_ONLY__ + + // TODO: + // local accessor ctor #1 + // accessor(handler &); + // Available only when: + // AccessTarget == access::target::local && Dimensions == 0 + // + // template + // accessor(typename std::enable_if<(AccessTarget == access::target::local && + // Dimensions == 0), handler>::type &commandGroupHandlerRef); + + + // local accessor ctor #2 + // accessor(range allocationSize, handler &); + // Available only when: + // AccessTarget == access::target::local && Dimensions => 0 + template + accessor(typename std::enable_if<(AccessTarget == access::target::local && + Dimensions > 0), + range>::type allocationSize, + handler &commandGroupHandlerRef) + : __impl(allocationSize, &commandGroupHandlerRef) {} +}; + +} // namespace sycl +} // namespace cl + +#undef SYCL_ACCESSOR_IMPL +#undef SYCL_ACCESSOR_SUBCLASS + +//TODO hash for accessor diff --git a/sycl/include/CL/sycl/atomic.hpp b/sycl/include/CL/sycl/atomic.hpp new file mode 100644 index 000000000000..ab06af923330 --- /dev/null +++ b/sycl/include/CL/sycl/atomic.hpp @@ -0,0 +1,353 @@ +//==---------------- atomic.hpp - SYCL atomics -----------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#ifdef __SYCL_DEVICE_ONLY__ +#include +#else +#include +#include +#endif +#include + +#define STATIC_ASSERT_NOT_FLOAT(T) \ + static_assert(!std::is_same::value, \ + "SYCL atomic function not available for float type") + +namespace cl { +namespace sycl { + +enum class memory_order : int { relaxed }; + +// Forward declaration +template +class multi_ptr; + +namespace detail { + +using memory_order = cl::sycl::memory_order; + +template struct IsValidAtomicType { + static constexpr bool value = + (std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value); +}; + +template struct IsValidAtomicAddressSpace { + static constexpr bool value = (AS == access::address_space::global_space || + AS == access::address_space::local_space); +}; + +// Type trait to translate a cl::sycl::access::address_space to +// a SPIR-V memory scope +template struct GetSpirvMemoryScope {}; +template <> struct GetSpirvMemoryScope { + static constexpr auto scope = cl::__spirv::Scope::Device; +}; +template <> struct GetSpirvMemoryScope { + static constexpr auto scope = ::cl::__spirv::Scope::Workgroup; +}; + +// Translate the cl::sycl::memory_order to a SPIR-V builtin order +static inline ::cl::__spirv::MemorySemantics +getSpirvMemorySemantics(memory_order Order) { + return ::cl::__spirv::MemorySemantics::None; +} + +} // namespace detail +} // namespace sycl +} // namespace cl + +#ifndef __SYCL_DEVICE_ONLY__ +// host implementation of SYCL atomics +namespace cl { +namespace sycl { +namespace detail { +// Translate cl::sycl::memory_order or cl::__spirv::MemorySemantics +// into std::memory_order +// Only relaxed memory semantics are supported currently +static inline std::memory_order +getStdMemoryOrder(::cl::__spirv::MemorySemantics MS) { + return std::memory_order_relaxed; +} 
+static inline std::memory_order getStdMemoryOrder(::cl::sycl::memory_order MS) { + return std::memory_order_relaxed; +} +} // namespace detail +} // namespace sycl + +// std::atomic version of atomic SPIR-V builtins +namespace __spirv { + +template +void OpAtomicStore(std::atomic *Ptr, Scope S, MemorySemantics MS, T V) { + Ptr->store(V, ::cl::sycl::detail::getStdMemoryOrder(MS)); +} + +template +T OpAtomicLoad(std::atomic *Ptr, Scope S, MemorySemantics MS) { + return Ptr->load(::cl::sycl::detail::getStdMemoryOrder(MS)); +} + +template +T OpAtomicExchange(std::atomic* Ptr, Scope S, MemorySemantics MS, T V) { + return Ptr->exchange(V, ::cl::sycl::detail::getStdMemoryOrder(MS)); +} + +template +extern T OpAtomicIAdd(std::atomic *Ptr, Scope S, MemorySemantics MS, T V) { + return Ptr->fetch_add(V, ::cl::sycl::detail::getStdMemoryOrder(MS)); +} + +template +extern T OpAtomicISub(std::atomic *Ptr, Scope S, MemorySemantics MS, T V) { + return Ptr->fetch_sub(V, ::cl::sycl::detail::getStdMemoryOrder(MS)); +} + +template +extern T OpAtomicAnd(std::atomic *Ptr, Scope S, MemorySemantics MS, T V) { + return Ptr->fetch_and(V, ::cl::sycl::detail::getStdMemoryOrder(MS)); +} + +template +extern T OpAtomicOr(std::atomic *Ptr, Scope S, MemorySemantics MS, T V) { + return Ptr->fetch_or(V, ::cl::sycl::detail::getStdMemoryOrder(MS)); +} + +template +extern T OpAtomicXor(std::atomic *Ptr, Scope S, MemorySemantics MS, T V) { + return Ptr->fetch_xor(V, ::cl::sycl::detail::getStdMemoryOrder(MS)); +} + +template +extern T OpAtomicMin(std::atomic *Ptr, Scope S, MemorySemantics MS, T V) { + std::memory_order MemoryOrder = ::cl::sycl::detail::getStdMemoryOrder(MS); + T Val = Ptr->load(MemoryOrder); + while (V < Val) { + if (Ptr->compare_exchange_strong(Val, V, MemoryOrder, MemoryOrder)) + break; + Val = Ptr->load(MemoryOrder); + } + return Val; +} + +template +extern T OpAtomicMax(std::atomic *Ptr, Scope S, MemorySemantics MS, T V) { + std::memory_order MemoryOrder = 
::cl::sycl::detail::getStdMemoryOrder(MS); + T Val = Ptr->load(MemoryOrder); + while (V > Val) { + if (Ptr->compare_exchange_strong(Val, V, MemoryOrder, MemoryOrder)) + break; + Val = Ptr->load(MemoryOrder); + } + return Val; +} + +} // namespace __spirv +} // namespace cl +#endif // !defined(__SYCL_DEVICE_ONLY__) + +namespace cl { +namespace sycl { + +template +class atomic { + static_assert(detail::IsValidAtomicType::value, + "Invalid SYCL atomic type. Valid types are: int, " + "unsigned int, long, unsigned long, long long, unsigned " + "long long, float"); + static_assert(detail::IsValidAtomicAddressSpace::value, + "Invalid SYCL atomic address_space. Valid address spaces are: " + "global_space, local_space"); + static constexpr auto SpirvScope = + detail::GetSpirvMemoryScope::scope; + +public: + template +#ifdef __SYCL_DEVICE_ONLY__ + atomic(multi_ptr ptr) + : Ptr(ptr.get()) +#else + atomic(multi_ptr ptr) + : Ptr(reinterpret_cast *>(ptr.get())) +#endif + { + static_assert(sizeof(T) == sizeof(pointerT), + "T and pointerT must be same size"); + } + + void store(T Operand, memory_order Order = memory_order::relaxed) volatile { + ::cl::__spirv::OpAtomicStore( + Ptr, SpirvScope, detail::getSpirvMemorySemantics(Order), Operand); + } + + T load(memory_order Order = memory_order::relaxed) volatile { + return ::cl::__spirv::OpAtomicLoad(Ptr, SpirvScope, + detail::getSpirvMemorySemantics(Order)); + } + + T exchange(T Operand, memory_order Order = memory_order::relaxed) volatile { + return ::cl::__spirv::OpAtomicExchange( + Ptr, SpirvScope, detail::getSpirvMemorySemantics(Order), Operand); + } + + bool compare_exchange_strong( + T &Expected, T Desired, memory_order SuccessOrder = memory_order::relaxed, + memory_order FailOrder = memory_order::relaxed) volatile { + STATIC_ASSERT_NOT_FLOAT(T); +#ifdef __SYCL_DEVICE_ONLY__ + T Value = ::cl::__spirv::OpAtomicCompareExchange( + Ptr, SpirvScope, detail::getSpirvMemorySemantics(SuccessOrder), + 
detail::getSpirvMemorySemantics(FailOrder), Desired, Expected); + return (Value == Desired); +#else + return Ptr->compare_exchange_strong(Expected, Desired, + detail::getStdMemoryOrder(SuccessOrder), + detail::getStdMemoryOrder(FailOrder)); +#endif + } + + T fetch_add(T Operand, memory_order Order = memory_order::relaxed) volatile { + STATIC_ASSERT_NOT_FLOAT(T); + return ::cl::__spirv::OpAtomicIAdd( + Ptr, SpirvScope, detail::getSpirvMemorySemantics(Order), Operand); + } + + T fetch_sub(T Operand, memory_order Order = memory_order::relaxed) volatile { + STATIC_ASSERT_NOT_FLOAT(T); + return ::cl::__spirv::OpAtomicISub( + Ptr, SpirvScope, detail::getSpirvMemorySemantics(Order), Operand); + } + + T fetch_and(T Operand, memory_order Order = memory_order::relaxed) volatile { + STATIC_ASSERT_NOT_FLOAT(T); + return ::cl::__spirv::OpAtomicAnd( + Ptr, SpirvScope, detail::getSpirvMemorySemantics(Order), Operand); + } + + T fetch_or(T Operand, memory_order Order = memory_order::relaxed) volatile { + STATIC_ASSERT_NOT_FLOAT(T); + return ::cl::__spirv::OpAtomicOr( + Ptr, SpirvScope, detail::getSpirvMemorySemantics(Order), Operand); + } + + T fetch_xor(T Operand, memory_order Order = memory_order::relaxed) volatile { + STATIC_ASSERT_NOT_FLOAT(T); + return ::cl::__spirv::OpAtomicXor( + Ptr, SpirvScope, detail::getSpirvMemorySemantics(Order), Operand); + } + + T fetch_min(T Operand, memory_order Order = memory_order::relaxed) volatile { + STATIC_ASSERT_NOT_FLOAT(T); + return ::cl::__spirv::OpAtomicMin( + Ptr, SpirvScope, detail::getSpirvMemorySemantics(Order), Operand); + } + + T fetch_max(T Operand, memory_order Order = memory_order::relaxed) volatile { + STATIC_ASSERT_NOT_FLOAT(T); + return ::cl::__spirv::OpAtomicMax( + Ptr, SpirvScope, detail::getSpirvMemorySemantics(Order), Operand); + } + +private: +#ifdef __SYCL_DEVICE_ONLY__ + typename detail::PtrValueType::type *Ptr; +#else + std::atomic *Ptr; +#endif +}; + +template +void atomic_store(atomic Object, T Operand, + 
memory_order MemoryOrder = memory_order::relaxed) { + Object.store(Operand, MemoryOrder); +} + +template +T atomic_load(atomic Object, + memory_order MemoryOrder = memory_order::relaxed) { + return Object.load(MemoryOrder); +} + +template +T atomic_exchange(atomic Object, T Operand, + memory_order MemoryOrder = memory_order::relaxed) { + return Object.exchange(Operand, MemoryOrder); +} + +// TODO: When CTS atomic tests are fixed remove this API +template +bool atomic_compare_exchange_strong( + atomic Object, T *Expected, T Desired, + memory_order SuccessOrder = memory_order::relaxed, + memory_order FailOrder = memory_order::relaxed) { + return Object.compare_exchange_strong(*Expected, Desired, SuccessOrder, + FailOrder); +} + +template +bool atomic_compare_exchange_strong( + atomic Object, T &Expected, T Desired, + memory_order SuccessOrder = memory_order::relaxed, + memory_order FailOrder = memory_order::relaxed) { + return Object.compare_exchange_strong(Expected, Desired, SuccessOrder, + FailOrder); +} + +template +T atomic_fetch_add(atomic Object, T Operand, + memory_order MemoryOrder = memory_order::relaxed) { + return Object.fetch_add(Operand, MemoryOrder); +} + +template +T atomic_fetch_sub(atomic Object, T Operand, + memory_order MemoryOrder = memory_order::relaxed) { + return Object.fetch_sub(Operand, MemoryOrder); +} + +template +T atomic_fetch_and(atomic Object, T Operand, + memory_order MemoryOrder = memory_order::relaxed) { + return Object.fetch_and(Operand, MemoryOrder); +} + +template +T atomic_fetch_or(atomic Object, T Operand, + memory_order MemoryOrder = memory_order::relaxed) { + return Object.fetch_or(Operand, MemoryOrder); +} + +template +T atomic_fetch_xor(atomic Object, T Operand, + memory_order MemoryOrder = memory_order::relaxed) { + return Object.fetch_xor(Operand, MemoryOrder); +} + +template +T atomic_fetch_min(atomic Object, T Operand, + memory_order MemoryOrder = memory_order::relaxed) { + return Object.fetch_min(Operand, MemoryOrder); 
+} + +template +T atomic_fetch_max(atomic Object, T Operand, + memory_order MemoryOrder = memory_order::relaxed) { + return Object.fetch_max(Operand, MemoryOrder); +} + +} // namespace sycl +} // namespace cl + +#undef STATIC_ASSERT_NOT_FLOAT diff --git a/sycl/include/CL/sycl/buffer.hpp b/sycl/include/CL/sycl/buffer.hpp new file mode 100644 index 000000000000..5cd1e5109ced --- /dev/null +++ b/sycl/include/CL/sycl/buffer.hpp @@ -0,0 +1,199 @@ +//==----------- buffer.hpp --- SYCL buffer ---------------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +// TODO: 4.3.4 Properties + +namespace cl { +namespace sycl { +class handler; +class queue; +template class range; + +template > +class buffer { +public: + using value_type = T; + using reference = value_type &; + using const_reference = const value_type &; + using allocator_type = AllocatorT; + + buffer(const range &bufferRange, + const property_list &propList = {}) { + impl = std::make_shared>( + bufferRange, propList); + } + + // buffer(const range &bufferRange, AllocatorT allocator, + // const property_list &propList = {}) { + // impl = std::make_shared(bufferRange, allocator, + // propList); + // } + + buffer(T *hostData, const range &bufferRange, + const property_list &propList = {}) { + impl = std::make_shared>( + hostData, bufferRange, propList); + } + + // buffer(T *hostData, const range &bufferRange, + // AllocatorT allocator, const property_list &propList = {}) { + // impl = std::make_shared(hostData, bufferRange, + // allocator, propList); + // } + + buffer(const T *hostData, const range &bufferRange, + const property_list &propList = {}) { + impl = std::make_shared>( + hostData, bufferRange, propList); + } + + // buffer(const T 
*hostData, const range &bufferRange, + // AllocatorT allocator, const property_list &propList = {}) { + // impl = std::make_shared(hostData, bufferRange, + // allocator, propList); + // } + + // buffer(const shared_ptr_class &hostData, + // const range &bufferRange, AllocatorT allocator, + // const property_list &propList = {}) { + // impl = std::make_shared(hostData, bufferRange, + // allocator, propList); + // } + + buffer(const shared_ptr_class &hostData, + const range &bufferRange, + const property_list &propList = {}) { + impl = std::make_shared>( + hostData, bufferRange, propList); + } + + // template + // buffer(InputIterator first, InputIterator last, AllocatorT allocator, + // const property_list &propList = {}) { + // impl = std::make_shared(first, last, allocator, + // propList); + // } + + template > + buffer(InputIterator first, InputIterator last, + const property_list &propList = {}) { + impl = std::make_shared>( + first, last, propList); + } + + // buffer(buffer b, const id + // &baseIndex, const range &subRange) { + // impl = std::make_shared(b, baseIndex, subRange); + // } + + template > + buffer(cl_mem MemObject, const context &SyclContext, + event AvailableEvent = {}) { + impl = std::make_shared>( + MemObject, SyclContext, AvailableEvent); + } + + buffer(const buffer &rhs) = default; + + buffer(buffer &&rhs) = default; + + buffer &operator=(const buffer &rhs) = default; + + buffer &operator=(buffer &&rhs) = default; + + ~buffer() = default; + + bool operator==(const buffer &rhs) const { return impl == rhs.impl; } + + bool operator!=(const buffer &rhs) const { return !(*this == rhs); } + + /* -- common interface members -- */ + + /* -- property interface members -- */ + + range get_range() const { return impl->get_range(); } + + size_t get_count() const { return impl->get_count(); } + + size_t get_size() const { return impl->get_size(); } + + AllocatorT get_allocator() const { return impl->get_allocator(); } + + template + accessor + 
get_access(handler &commandGroupHandler) { + return impl->template get_access(*this, commandGroupHandler); + } + + template + accessor + get_access() { + return impl->template get_access(*this); + } + + // template accessor get_access( handler &commandGroupHandler, + // range accessRange, id accessOffset = {}) { + // return impl->get_access(commandGroupHandler, accessRange, + // accessOffset); + // } + + // template + // accessor get_access( range accessRange, + // id accessOffset = {}) { + // return impl->get_access(accessRange, accessOffset); + // } + + template + void set_final_data(Destination finalData = nullptr) { + impl->set_final_data(finalData); + } + + // void set_write_back(bool flag = true) { return impl->set_write_back(flag); + // } + + // bool is_sub_buffer() const { return impl->is_sub_buffer(); } + + // template + // buffer + // reinterpret(range reinterpretRange) const { + // return impl->reinterpret((reinterpretRange)); + // } + +private: + shared_ptr_class> impl; + template + friend decltype(Obj::impl) detail::getSyclObjImpl(const Obj &SyclObject); +}; +} // namespace sycl +} // namespace cl + +namespace std { +template +struct hash> { + size_t + operator()(const cl::sycl::buffer &b) const { + return hash>>()( + cl::sycl::detail::getSyclObjImpl(b)); + } +}; +} // namespace std diff --git a/sycl/include/CL/sycl/context.hpp b/sycl/include/CL/sycl/context.hpp new file mode 100644 index 000000000000..a53933042a83 --- /dev/null +++ b/sycl/include/CL/sycl/context.hpp @@ -0,0 +1,80 @@ +//==---------------- context.hpp - SYCL context ----------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include +// 4.6.2 Context class + +namespace cl { +namespace sycl { +class context { +public: + explicit context(const async_handler &asyncHandler = {}) + : context(default_selector().select_device(), asyncHandler) {} + + context(const device &dev, async_handler asyncHandler = {}) + : context(vector_class(1, dev), asyncHandler) {} + + context(const platform &plt, async_handler asyncHandler = {}) + : context(plt.get_devices(), asyncHandler) {} + + context(const vector_class &deviceList, + async_handler asyncHandler = {}); + + context(cl_context clContext, async_handler asyncHandler = {}); + + template + typename info::param_traits::return_type + get_info() const { + return impl->get_info(); + } + + context(const context &rhs) = default; + + context(context &&rhs) = default; + + context &operator=(const context &rhs) = default; + + context &operator=(context &&rhs) = default; + + bool operator==(const context &rhs) const { return impl == rhs.impl; } + + bool operator!=(const context &rhs) const { return !(*this == rhs); } + + cl_context get() const { return impl->get(); } + + bool is_host() const { return impl->is_host(); } + + platform get_platform() const { return impl->get_platform(); } + + vector_class get_devices() const { return impl->get_devices(); } + +private: + std::shared_ptr impl; + template + friend decltype(T::impl) detail::getSyclObjImpl(const T &SyclObject); +}; + +} // namespace sycl +} // namespace cl + +namespace std { +template <> struct hash { + size_t operator()(const cl::sycl::context &c) const { + return hash>()( + cl::sycl::detail::getSyclObjImpl(c)); + } +}; +} // namespace std diff --git a/sycl/include/CL/sycl/detail/array.hpp b/sycl/include/CL/sycl/detail/array.hpp new file mode 100644 index 000000000000..b8cf259dd0bd --- /dev/null +++ b/sycl/include/CL/sycl/detail/array.hpp @@ 
-0,0 +1,117 @@ +//==-------- array.hpp --- SYCL common iteration object ---------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +namespace cl { +namespace sycl { +template struct id; +template class range; +namespace detail { + +template class array { +public: + INLINE_IF_DEVICE array() : common_array{0} {} + + /* The following constructor is only available in the array struct + * specialization where: dimensions==1 */ + template INLINE_IF_DEVICE + array(typename std::enable_if<(N == 1), size_t>::type dim0) + : common_array{dim0} {} + + /* The following constructor is only available in the array struct + * specialization where: dimensions==2 */ + template INLINE_IF_DEVICE + array(typename std::enable_if<(N == 2), size_t>::type dim0, size_t dim1) + : common_array{dim0, dim1} {} + + /* The following constructor is only available in the array struct + * specialization where: dimensions==3 */ + template INLINE_IF_DEVICE + array(typename std::enable_if<(N == 3), size_t>::type dim0, size_t dim1, + size_t dim2) + : common_array{dim0, dim1, dim2} {} + + // Conversion operators to derived classes + INLINE_IF_DEVICE operator cl::sycl::id() const { + cl::sycl::id result; + for (int i = 0; i < dimensions; ++i) { + result[i] = common_array[i]; + } + return result; + } + + INLINE_IF_DEVICE operator cl::sycl::range() const { + cl::sycl::range result; + for (int i = 0; i < dimensions; ++i) { + result[i] = common_array[i]; + } + return result; + } + + INLINE_IF_DEVICE size_t get(int dimension) const { + check_dimension(dimension); + return common_array[dimension]; + } + + INLINE_IF_DEVICE size_t &operator[](int dimension) { + check_dimension(dimension); + return common_array[dimension]; + } + + INLINE_IF_DEVICE 
size_t operator[](int dimension) const { + check_dimension(dimension); + return common_array[dimension]; + } + + INLINE_IF_DEVICE array(const array &rhs) = default; + INLINE_IF_DEVICE array(array &&rhs) = default; + INLINE_IF_DEVICE array &operator=(const array &rhs) = default; + INLINE_IF_DEVICE array &operator=(array &&rhs) = default; + + // Returns true iff all elements in 'this' are equal to + // the corresponding elements in 'rhs'. + INLINE_IF_DEVICE bool operator==(const array &rhs) const { + for (int i = 0; i < dimensions; ++i) { + if (this->common_array[i] != rhs.common_array[i]) { + return false; + } + } + return true; + } + + // Returns true iff there is at least one element in 'this' + // which is not equal to the corresponding element in 'rhs'. + INLINE_IF_DEVICE bool operator!=(const array &rhs) const { + for (int i = 0; i < dimensions; ++i) { + if (this->common_array[i] != rhs.common_array[i]) { + return true; + } + } + return false; + } + +protected: + size_t common_array[dimensions]; + ALWAYS_INLINE void check_dimension(int dimension) const { +#ifndef __SYCL_DEVICE_ONLY__ + if (dimension >= dimensions || dimension < 0) { + throw cl::sycl::invalid_parameter_error("Index out of range"); + } +#endif + } +}; + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/buffer_impl.hpp b/sycl/include/CL/sycl/detail/buffer_impl.hpp new file mode 100644 index 000000000000..f763c06fe4c9 --- /dev/null +++ b/sycl/include/CL/sycl/detail/buffer_impl.hpp @@ -0,0 +1,516 @@ +//==---------- buffer_impl.hpp --- SYCL buffer ----------------*- C++-*---==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace cl { +namespace sycl { +using QueueImplPtr = std::shared_ptr; +using EventImplPtr = std::shared_ptr; +// Forward declarations +template +class accessor; +template class buffer; +class handler; +class queue; +template class id; +template class range; +template using buffer_allocator = std::allocator; +namespace detail { +template > +class buffer_impl { +public: + buffer_impl(const range &bufferRange, + const property_list &propList = {}) + : buffer_impl((T *)nullptr, bufferRange, propList) {} + + buffer_impl(T *hostData, const range &bufferRange, + const property_list &propList = {}) + : Range(bufferRange), Props(propList) { + if (Props.has_property()) { + BufPtr = hostData; + } else { + BufData.resize(get_size()); + BufPtr = reinterpret_cast(BufData.data()); + if (hostData != nullptr) { + set_final_data(hostData); + std::copy(hostData, hostData + get_count(), BufPtr); + } + } + } + + // TODO temporary solution for allowing initialisation with const data + buffer_impl(const T *hostData, const range &bufferRange, + const property_list &propList = {}) + : Range(bufferRange), Props(propList) { + if (Props.has_property()) { + // TODO make this buffer read only + BufPtr = const_cast(hostData); + } else { + BufData.resize(get_size()); + BufPtr = reinterpret_cast(BufData.data()); + if (hostData != nullptr) { + std::copy(hostData, hostData + get_count(), BufPtr); + } + } + } + + buffer_impl(const shared_ptr_class &hostData, + const range &bufferRange, + const property_list &propList = {}) + : Range(bufferRange), Props(propList) { + if (Props.has_property()) { + BufPtr = hostData.get(); + } else { + BufData.resize(get_size()); + BufPtr = reinterpret_cast(BufData.data()); + if (hostData.get() != nullptr) { + 
weak_ptr_class hostDataWeak = hostData; + set_final_data(hostDataWeak); + std::copy(hostData.get(), hostData.get() + get_count(), BufPtr); + } + } + } + + template > + buffer_impl(InputIterator first, InputIterator last, + const property_list &propList = {}) + : Range(range<1>(std::distance(first, last))), Props(propList) { + if (Props.has_property()) { + BufPtr = &*first; + } else { + BufData.resize(get_size()); + BufPtr = reinterpret_cast(BufData.data()); + std::copy(first, last, BufPtr); + } + } + + template > + buffer_impl(cl_mem MemObject, const context &SyclContext, + event AvailableEvent = {}) + : OpenCLInterop(true), AvailableEvent(AvailableEvent) { + if (SyclContext.is_host()) + throw cl::sycl::invalid_parameter_error( + "Creation of interoperability buffer using host context is not " + "allowed"); + + CHECK_OCL_CODE(clGetMemObjectInfo(MemObject, CL_MEM_CONTEXT, + sizeof(OpenCLContext), &OpenCLContext, nullptr)); + if (SyclContext.get() != OpenCLContext) + throw cl::sycl::invalid_parameter_error( + "Input context must be the same as the context of cl_mem"); + OCLState.Mem = MemObject; + CHECK_OCL_CODE(clRetainMemObject(MemObject)); + } + + range get_range() const { return Range; } + + size_t get_count() const { return Range.size(); } + + size_t get_size() const { return get_count() * sizeof(T); } + + ~buffer_impl() { + if (!OpenCLInterop) + // TODO. Use node instead? + simple_scheduler::Scheduler::getInstance() + .copyBack( + *this); + + if (uploadData != nullptr) { + uploadData(); + } + + // TODO. Use node instead? 
+ simple_scheduler::Scheduler::getInstance().removeBuffer(*this); + + if (OpenCLInterop) + CHECK_OCL_CODE_NO_EXC(clReleaseMemObject(OCLState.Mem)); + } + + void set_final_data(std::nullptr_t) { uploadData = nullptr; } + + void set_final_data(weak_ptr_class final_data) { + if (OpenCLInterop) + throw cl::sycl::runtime_error( + "set_final_data could not be used with interoperability buffer"); + uploadData = [this, final_data]() { + if (auto finalData = final_data.lock()) { + std::copy(BufPtr, BufPtr + get_count(), finalData.get()); + } + }; + } + + template void set_final_data(Destination final_data) { + if (OpenCLInterop) + throw cl::sycl::runtime_error( + "set_final_data could not be used with interoperability buffer"); + static_assert(!std::is_const::value, + "Сan not write in a constant Destination. Destination should " + "not be const."); + uploadData = [this, final_data]() mutable { + std::copy(BufPtr, BufPtr + get_count(), final_data); + }; + } + + template + accessor + get_access(buffer &Buffer, + handler &commandGroupHandler) { + return accessor( + Buffer, commandGroupHandler); + } + + template + accessor + get_access(buffer &Buffer) { + return accessor(Buffer); + } + +public: + void moveMemoryTo(QueueImplPtr Queue, std::vector DepEvents, + EventImplPtr Event); + + void fill(QueueImplPtr Queue, std::vector DepEvents, + EventImplPtr Event, const void *Pattern, size_t PatternSize, + int Dim, size_t *Offset, size_t *Range); + + void copy(QueueImplPtr Queue, std::vector DepEvents, + EventImplPtr Event, simple_scheduler::BufferReqPtr SrcReq, + const int DimSrc, const size_t *const SrcRange, + const size_t *const SrcOffset, const size_t *const DestOffset, + const size_t SizeTySrc, const size_t SizeSrc, + const size_t *const BuffSrcRange); + + size_t convertSycl2OCLMode(cl::sycl::access::mode mode); + + bool isValidAccessToMem(cl::sycl::access::mode AccessMode); + + void allocate(QueueImplPtr Queue, std::vector DepEvents, + EventImplPtr Event, cl::sycl::access::mode 
mode); + + cl_mem getOpenCLMem() const; + +private: + // There are internal structures in this section. + enum DeviceMemoryState { + DMS_NULL, // No data were transferred between host and device. + DMS_COPIED, // Data were copied from host to device. + DMS_MODIFIED, // Data in device memory were modified. + DMS_HOST // Use host pointer for device memory + }; + // Contains the latest virtual state of buffer during commands enqueueing. + // TODO: Need to find better solution, at least make state for each device. + struct OpenCLMemState { + QueueImplPtr Queue; + cl_mem Mem = nullptr; + }; + +private: + // This field must be the first to guarantee that it's safe to use + // reinterpret casting while setting kernel arguments in order to get cl_mem + // value from the buffer regardless of its dimensionality. + OpenCLMemState OCLState; + bool OpenCLInterop = false; + event AvailableEvent; + cl_context OpenCLContext = nullptr; + T *BufPtr = nullptr; + vector_class BufData; + // TODO: enable support of cl_mem objects from multiple contexts + // TODO: at the current moment, using a buffer on multiple devices + // or on a device and a host simultaneously is not supported (the + // implementation is incorrect). 
+ range Range; + property_list Props; + std::function uploadData = nullptr; + template + friend class cl::sycl::accessor; +}; + +template +void buffer_impl::fill( + QueueImplPtr Queue, std::vector DepEvents, + EventImplPtr Event, const void *Pattern, size_t PatternSize, int Dim, + size_t *OffsetArr, size_t *RangeArr) { + + assert(dimensions == 1 && + "OpenCL doesn't support multidimensional fill method."); + assert(!Queue->is_host() && "Host case is handled in other place."); + + size_t Offset = OffsetArr[0]; + size_t Size = RangeArr[0] * PatternSize; + + cl::sycl::context Context = Queue->get_context(); + + OCLState.Queue = std::move(Queue); + Event->setIsHostEvent(false); + + cl_event &BufEvent = Event->getHandleRef(); + std::vector CLEvents = + detail::getOrWaitEvents(std::move(DepEvents), Context); + + cl_command_queue CommandQueue = OCLState.Queue->get(); + cl_int Error = clEnqueueFillBuffer( + CommandQueue, OCLState.Mem, Pattern, PatternSize, Offset, Size, + CLEvents.size(), CLEvents.data(), &BufEvent); + + CHECK_OCL_CODE(Error); + CHECK_OCL_CODE(clReleaseCommandQueue(CommandQueue)); +} + +template +void buffer_impl::copy( + QueueImplPtr Queue, std::vector DepEvents, + EventImplPtr Event, simple_scheduler::BufferReqPtr SrcReq, const int DimSrc, + const size_t *const SrcRange, const size_t *const SrcOffset, + const size_t *const DestOffset, const size_t SizeTySrc, + const size_t SizeSrc, const size_t *const BuffSrcRange) { + assert(!Queue->is_host() && "Host case is handled in other place."); + + size_t *BuffDestRange = &get_range()[0]; + size_t SizeTyDest = sizeof(T); + const int DimDest = dimensions; + + cl::sycl::context Context = Queue->get_context(); + + cl_event &BufEvent = Event->getHandleRef(); + std::vector CLEvents = + detail::getOrWaitEvents(std::move(DepEvents), Context); + cl_int Error; + + cl_command_queue CommandQueue = Queue->get(); + if (1 == DimSrc && 1 == DimDest) { + Error = clEnqueueCopyBuffer(CommandQueue, SrcReq->getCLMemObject(), + 
OCLState.Mem, SrcOffset[0], DestOffset[0], + SizeSrc * SizeTySrc, CLEvents.size(), + CLEvents.data(), &BufEvent); + } else { + size_t SrcOrigin[3] = {SrcOffset[0] * SizeTySrc, + (1 == DimSrc) ? 0 : SrcOffset[1], + (3 == DimSrc) ? SrcOffset[2] : 0}; + size_t DstOrigin[3] = {DestOffset[0] * SizeTyDest, + (1 == DimDest) ? 0 : DestOffset[1], + (3 == DimDest) ? DestOffset[2] : 0}; + size_t Region[3] = {SrcRange[0] * SizeTySrc, + (1 == DimSrc) ? 1 : SrcRange[1], + (3 == DimSrc) ? SrcRange[2] : 1}; + size_t SrcRowPitch = (1 == DimSrc) ? 0 : SizeTySrc * BuffSrcRange[0]; + size_t SrcSlicePitch = + (3 == DimSrc) ? SizeTySrc * BuffSrcRange[0] * BuffSrcRange[1] : 0; + size_t DstRowPitch = (1 == DimSrc) ? 0 : SizeTyDest * BuffDestRange[0]; + size_t DstSlicePitch = + (3 == DimSrc) ? SizeTyDest * BuffDestRange[0] * BuffDestRange[1] : 0; + + Error = clEnqueueCopyBufferRect( + CommandQueue, SrcReq->getCLMemObject(), OCLState.Mem, SrcOrigin, + DstOrigin, Region, SrcRowPitch, SrcSlicePitch, DstRowPitch, + DstSlicePitch, CLEvents.size(), CLEvents.data(), &BufEvent); + } + CHECK_OCL_CODE(Error); + CHECK_OCL_CODE(clReleaseCommandQueue(CommandQueue)); + OCLState.Queue = std::move(Queue); + Event->setIsHostEvent(false); +} + +template +void buffer_impl::moveMemoryTo( + QueueImplPtr Queue, std::vector DepEvents, + EventImplPtr Event) { + + cl::sycl::context Context = Queue->get_context(); + + if (OpenCLInterop && (Context.get() != OpenCLContext)) + throw cl::sycl::runtime_error( + "Interoperability buffer could not be used in a context other than the " + "context associated with the OpenCL memory object."); + + // TODO: Move all implementation specific commands to separate file? + // TODO: Make allocation in separate command? + + // Special case, move to "user host" + // TODO: Check discuss if "user host" and "host device" are the same. 
+ if ((Queue->is_host()) && (OCLState.Queue->is_host())) { + detail::waitEvents(DepEvents); + Event->setIsHostEvent(true); + OCLState.Queue = std::move(Queue); + return; + } + + assert(OCLState.Queue->get_context() != Context || + OCLState.Queue->get_device() != Queue->get_device() && + "Attempt to move to the same env"); + + // Copy from OCL device to host device. + if (!OCLState.Queue->is_host() && Queue->is_host()) { + const size_t ByteSize = get_size(); + + std::vector CLEvents = + detail::getOrWaitEvents(std::move(DepEvents), Context); + + // TODO: Handle different situations with host PTR. + // Enqueue copying from OCL buffer to host. + cl_event &ReadBufEvent = Event->getHandleRef(); + cl_int Error = clEnqueueReadBuffer( + OCLState.Queue->getHandleRef(), OCLState.Mem, + /*blocking_read=*/CL_FALSE, /*offset=*/0, ByteSize, BufPtr, + CLEvents.size(), CLEvents.data(), &ReadBufEvent); + CHECK_OCL_CODE(Error); + + Event->setIsHostEvent(false); + + OCLState.Queue = std::move(Queue); + OCLState.Mem = nullptr; + return; + } + // Copy from host to OCL device. + if (OCLState.Queue->is_host() && !Queue->is_host()) { + const size_t ByteSize = get_size(); + cl_int Error; + cl_mem Mem = clCreateBuffer(Context.get(), CL_MEM_READ_WRITE, ByteSize, + /*host_ptr=*/nullptr, &Error); + CHECK_OCL_CODE(Error); + + OCLState.Queue = std::move(Queue); + OCLState.Mem = Mem; + + // Just exit if nothing to read from host. + if (nullptr == BufPtr) { + return; + } + std::vector CLEvents = + detail::getOrWaitEvents(std::move(DepEvents), Context); + cl_event &WriteBufEvent = Event->getHandleRef(); + // Enqueue copying from host to new OCL buffer. 
+ Error = + clEnqueueWriteBuffer(OCLState.Queue->getHandleRef(), Mem, + /*blocking_write=*/CL_FALSE, /*offset=*/0, + ByteSize, BufPtr, CLEvents.size(), CLEvents.data(), + &WriteBufEvent); // replace &WriteBufEvent to NULL + CHECK_OCL_CODE(Error); + Event->setIsHostEvent(false); + + return; + } + + assert(0 && "Not handled"); +} + +template +size_t buffer_impl::convertSycl2OCLMode( + cl::sycl::access::mode mode) { + switch (mode) { + case cl::sycl::access::mode::read: + return CL_MEM_READ_ONLY; + case cl::sycl::access::mode::write: + return CL_MEM_WRITE_ONLY; + case cl::sycl::access::mode::read_write: + case cl::sycl::access::mode::atomic: + return CL_MEM_READ_WRITE; + default: + assert(0 && "Unhandled conversion from Sycl access mode to OCL one."); + return 0; + } +} + +template +bool buffer_impl::isValidAccessToMem( + cl::sycl::access::mode AccessMode) { + cl_mem_flags Flags; + assert(OCLState.Mem != nullptr && + "OpenCL memory associated with the buffer is null"); + CHECK_OCL_CODE(clGetMemObjectInfo(OCLState.Mem, CL_MEM_FLAGS, sizeof(Flags), + &Flags, nullptr)); + if (((Flags & CL_MEM_READ_WRITE) == 0) && + ((convertSycl2OCLMode(AccessMode) & Flags) == 0)) + return false; + return true; +} + +template +void buffer_impl::allocate( + QueueImplPtr Queue, std::vector DepEvents, + EventImplPtr Event, cl::sycl::access::mode mode) { + + detail::waitEvents(DepEvents); + + cl::sycl::context Context = Queue->get_context(); + + if (OpenCLInterop && (Context.get() != OpenCLContext)) + throw cl::sycl::runtime_error( + "Interoperability buffer could not be used in a context other than the " + "context associated with the OpenCL memory object."); + + if (OpenCLInterop) { + AvailableEvent.wait(); + OCLState.Queue = std::move(Queue); + Event->setIsHostEvent(true); + return; + } + + if (!Queue->is_host()) { + size_t ByteSize = get_size(); + cl_int Error; + + cl_mem Mem = clCreateBuffer(Context.get(), convertSycl2OCLMode(mode), + ByteSize, nullptr, &Error); + CHECK_OCL_CODE(Error); 
+ + cl_event &WriteBufEvent = Event->getHandleRef(); + Error = clEnqueueWriteBuffer(Queue->getHandleRef(), Mem, + /*blocking_write=*/CL_FALSE, /*offset=*/0, + ByteSize, BufPtr, /*num_of_events=*/0, + /*dep_list=*/nullptr, &WriteBufEvent); + CHECK_OCL_CODE(Error); + + OCLState.Queue = std::move(Queue); + OCLState.Mem = Mem; + + Event->setIsHostEvent(false); + + return; + } + if (Queue->is_host()) { + Event->setIsHostEvent(true); + OCLState.Queue = std::move(Queue); + return; + } + assert(0 && "Unhandled Alloca"); +} + +template +cl_mem buffer_impl::getOpenCLMem() const { + assert(nullptr != OCLState.Mem); + return OCLState.Mem; +} + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/common.hpp b/sycl/include/CL/sycl/detail/common.hpp new file mode 100644 index 000000000000..7241f7fe1211 --- /dev/null +++ b/sycl/include/CL/sycl/detail/common.hpp @@ -0,0 +1,118 @@ +//==---------- common.hpp ----- Common declarations ------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +// Suppress a compiler warning about undefined CL_TARGET_OPENCL_VERSION +// Khronos ICD supports only latest OpenCL version +#define CL_TARGET_OPENCL_VERSION 220 +#include +#include +#include +#include + +const char *stringifyErrorCode(cl_int error); + +#define OCL_CODE_TO_STR(code) \ + std::string(std::to_string(code) + " (" + stringifyErrorCode(code) + ")") + +#define STRINGIFY_LINE_HELP(s) #s +#define STRINGIFY_LINE(s) STRINGIFY_LINE_HELP(s) + +#define OCL_ERROR_REPORT \ + "OpenCL API failed. 
" __FILE__ \ + ":" STRINGIFY_LINE(__LINE__) ": " \ + "OpenCL API returns: " + +#ifndef SYCL_SUPPRESS_OCL_ERROR_REPORT +#include +#define REPORT_OCL_ERR_TO_STREAM(code) \ + if (code != CL_SUCCESS) { \ + std::cerr << OCL_ERROR_REPORT << OCL_CODE_TO_STR(code) << std::endl; \ + } +#endif + +#ifndef SYCL_SUPPRESS_EXCEPTIONS +#include + +#define REPORT_OCL_ERR_TO_EXC(code, exc) \ + if (code != CL_SUCCESS) { \ + std::string errorMessage(OCL_ERROR_REPORT + OCL_CODE_TO_STR(code)); \ + std::cerr << errorMessage << std::endl; \ + throw exc(errorMessage.c_str(), (code)); \ + } +#define REPORT_OCL_ERR_TO_EXC_THROW(code, exc) REPORT_OCL_ERR_TO_EXC(code, exc) +#define REPORT_OCL_ERR_TO_EXC_BASE(code) \ + REPORT_OCL_ERR_TO_EXC(code, cl::sycl::runtime_error) +#else +#define REPORT_OCL_ERR_TO_EXC_BASE(code) REPORT_OCL_ERR_TO_STREAM(code) +#endif + +#ifdef SYCL_SUPPRESS_OCL_ERROR_REPORT +#define CHECK_OCL_CODE(X) (void)(X) +#define CHECK_OCL_CODE_THROW(X, EXC) (void)(X) +#define CHECK_OCL_CODE_NO_EXC(X) (void)(X) +#else +#define CHECK_OCL_CODE(X) REPORT_OCL_ERR_TO_EXC_BASE(X) +#define CHECK_OCL_CODE_THROW(X, EXC) REPORT_OCL_ERR_TO_EXC_THROW(X, EXC) +#define CHECK_OCL_CODE_NO_EXC(X) REPORT_OCL_ERR_TO_STREAM(X) +#endif + +#ifndef __has_attribute +#define __has_attribute(x) 0 +#endif + +#if __has_attribute(always_inline) +#define ALWAYS_INLINE __attribute__((always_inline)) +#else +#define ALWAYS_INLINE +#endif + +// TODO this macro is introduced to workaround SPIRV translator problem with +// dropping linkonce_odr attribute leading to duplicated symbol errors in +// the bitcode linker for functions defined in the headers. Remove once fixed. +#ifdef __SYCL_DEVICE_ONLY__ +#define INLINE_IF_DEVICE ALWAYS_INLINE +#else +#define INLINE_IF_DEVICE +#endif // __SYCL_DEVICE_ONLY__ + + +namespace cl { +namespace sycl { +namespace detail { +// Helper function for extracting implementation from SYCL's interface objects. +// Note! 
This function relies on the fact that all SYCL interface classes +// contain "impl" field that points to implementation object. "impl" field +// should be accessible from this function. +template decltype(T::impl) getSyclObjImpl(const T &SyclObject) { + return SyclObject.impl; +} + +// Helper function for creation SYCL interface objects from implementations. +// Note! This function relies on the fact that all SYCL interface classes +// contain "impl" field that points to implementation object. "impl" field +// should be accessible from this function. +template T createSyclObjFromImpl(decltype(T::impl) ImplObj) { + return T(ImplObj); +} + +#ifdef __SYCL_DEVICE_ONLY__ +// The flag type for passing flag arguments to barrier(), mem_fence(), +// read_mem_fence(), and write_mem_fence() functions. +typedef uint cl_mem_fence_flags; + +const cl_mem_fence_flags CLK_LOCAL_MEM_FENCE = 0x01; +const cl_mem_fence_flags CLK_GLOBAL_MEM_FENCE = 0x02; +const cl_mem_fence_flags CLK_CHANNEL_MEM_FENCE = 0x04; +#endif // __SYCL_DEVICE_ONLY__ + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/common_info.hpp b/sycl/include/CL/sycl/detail/common_info.hpp new file mode 100644 index 000000000000..636dd9a42819 --- /dev/null +++ b/sycl/include/CL/sycl/detail/common_info.hpp @@ -0,0 +1,22 @@ +//==------- common_info.hpp ----- Common SYCL info methods------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace cl { +namespace sycl { +namespace detail { + +vector_class split_string(const string_class &str, + char delimeter); + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/context_host.hpp b/sycl/include/CL/sycl/detail/context_host.hpp new file mode 100644 index 000000000000..0731a22c54f8 --- /dev/null +++ b/sycl/include/CL/sycl/detail/context_host.hpp @@ -0,0 +1,46 @@ +//==------------- context_host.hpp - SYCL host context ---------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +// 4.6.2 Context class + +namespace cl { +namespace sycl { +namespace detail { +class context_host : public context_impl { +public: + context_host(const device &rhs, async_handler asyncHandler) + : context_impl(asyncHandler), dev(rhs) {} + + cl_context get() const override { + throw invalid_object_error("This instance of context is a host instance"); + } + + bool is_host() const override { return true; } + + platform get_platform() const override { return platform(); } + + vector_class get_devices() const override { + return vector_class(1, dev); + } + + template + typename info::param_traits::return_type get_info() const; +private: + device dev; +}; +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/context_impl.hpp b/sycl/include/CL/sycl/detail/context_impl.hpp new file mode 100644 index 000000000000..087d09779f7a --- /dev/null +++ b/sycl/include/CL/sycl/detail/context_impl.hpp @@ -0,0 +1,84 @@ +//==---------------- context.hpp - SYCL context ----------------------------==// 
+// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +// 4.6.2 Context class + +namespace cl { +namespace sycl { +// Forward declaration +class platform; +class device; +namespace detail { +template struct get_context_info_cl { + using RetType = + typename info::param_traits::return_type; + + static RetType _(cl_context ctx) { + RetType Result = 0; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetContextInfo(ctx, cl_context_info(param), sizeof(Result), + &Result, nullptr)); + return Result; + } +}; + +class context_impl { +public: + context_impl(async_handler asyncHandler) : m_AsyncHandler(asyncHandler) {} + + template + inline typename info::param_traits::return_type + get_info() const; + + const async_handler& get_async_handler() const { return m_AsyncHandler; } + + virtual cl_context get() const = 0; + + virtual bool is_host() const = 0; + + virtual platform get_platform() const = 0; + + virtual vector_class get_devices() const = 0; + + virtual ~context_impl() = default; + +private: + async_handler m_AsyncHandler; +}; +template <> +inline typename info::param_traits::return_type +context_impl::get_info() const { + if (is_host()) { + return 0; + } + return get_context_info_cl::_(this->get()); +} +template <> +inline typename info::param_traits::return_type +context_impl::get_info() const { + return get_platform(); +} +template <> +inline typename info::param_traits::return_type +context_impl::get_info() const { + return get_devices(); +} + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/context_opencl.hpp b/sycl/include/CL/sycl/detail/context_opencl.hpp new file mode 100644 index 000000000000..e5755902e29b 
--- /dev/null +++ b/sycl/include/CL/sycl/detail/context_opencl.hpp @@ -0,0 +1,90 @@ +//==------------ context_opencl.hpp - SYCL OpenCL context ------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include + +// 4.6.2 Context class + +namespace cl { +namespace sycl { +// Forward declaration +class platform; +namespace detail { +class context_opencl : public context_impl { +public: + context_opencl(const vector_class devices, + async_handler asyncHandler) + : context_impl(asyncHandler) { + dev_list = devices; + plt = dev_list[0].get_platform(); + vector_class dev_ids; + for (const auto &d : dev_list) + dev_ids.push_back(d.get()); + cl_int error; + id = clCreateContext(0, dev_ids.size(), dev_ids.data(), 0, 0, &error); + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(error); + } + + context_opencl(cl_context clContext, async_handler asyncHandler) + : context_impl(asyncHandler) { + id = clContext; + vector_class dev_ids; + size_t devicesBuffer = 0; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE( + clGetContextInfo(id, CL_CONTEXT_DEVICES, 0, nullptr, &devicesBuffer)); + dev_ids.resize(devicesBuffer / sizeof(cl_device_id)); + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetContextInfo(id, CL_CONTEXT_DEVICES, devicesBuffer, + &dev_ids[0], nullptr)); + + for (auto dev : dev_ids) { + dev_list.emplace_back(dev); + } + // TODO What if dev_list if empty? 
dev_list[0].get_platform() + plt = platform(dev_list[0].get_platform()); + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clRetainContext(id)); + } + + cl_context get() const override { + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clRetainContext(id)); + return id; + } + + bool is_host() const override { return false; } + + platform get_platform() const override { return plt; } + + vector_class get_devices() const override { return dev_list; } + + ~context_opencl() { + // TODO replace CHECK_OCL_CODE_NO_EXC to CHECK_OCL_CODE and + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE_NO_EXC(clReleaseContext(id)); + } + // TODO: implement param traits + // template + // typename param_traits::type get_info() const; +private: + vector_class dev_list; + cl_context id; + platform plt; +}; +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/device_host.hpp b/sycl/include/CL/sycl/detail/device_host.hpp new file mode 100644 index 000000000000..6ce172ac435e --- /dev/null +++ b/sycl/include/CL/sycl/detail/device_host.hpp @@ -0,0 +1,66 @@ +//==--------------- device_host.hpp - SYCL host device --------------------== // +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +namespace cl { +namespace sycl { +namespace detail { +// TODO: 4.6.4 Partitioning into multiple SYCL devices +// TODO: 4.6.4.2 Device information descriptors +// TODO: Make code thread-safe +class device_host : public device_impl { +public: + device_host() = default; + cl_device_id get() const override { + throw invalid_object_error("This instance of device is a host instance"); + } + + bool is_host() const override { return true; } + + bool is_cpu() const override { return false; } + + bool is_gpu() const override { return false; } + + bool is_accelerator() const override { return false; } + + platform get_platform() const override { return platform(); } + + bool has_extension(const string_class &extension_name) const override { + // TODO: implement extension management; + return false; + } + + vector_class create_sub_devices(size_t nbSubDev) const { + // TODO: implement host device partitioning + throw runtime_error( + "Partitioning to subdevices of the host device is not implemented yet"); + } + + vector_class + create_sub_devices(const vector_class &counts) const { + // TODO: implement host device partitioning + throw runtime_error( + "Partitioning to subdevices of the host device is not implemented yet"); + } + + vector_class + create_sub_devices(info::partition_affinity_domain affinityDomain) const { + // TODO: implement host device partitioning + throw runtime_error( + "Partitioning to subdevices of the host device is not implemented yet"); + } +}; +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/device_impl.hpp b/sycl/include/CL/sycl/detail/device_impl.hpp new file mode 100644 index 000000000000..671a9ed187d7 --- /dev/null +++ b/sycl/include/CL/sycl/detail/device_impl.hpp @@ -0,0 +1,83 @@ +//==----------------- device_impl.hpp - SYCL device ------------------------==// 
+// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +namespace cl { +namespace sycl { + +// Forward declaration +class platform; + +namespace detail { +// TODO: 4.6.4 Partitioning into multiple SYCL devices +// TODO: 4.6.4.2 Device information descriptors +// TODO: Make code thread-safe +class device_impl { +public: + virtual ~device_impl() = default; + + virtual cl_device_id get() const = 0; + + virtual bool is_host() const = 0; + + virtual bool is_cpu() const = 0; + + virtual bool is_gpu() const = 0; + + virtual bool is_accelerator() const = 0; + + virtual platform get_platform() const = 0; + + virtual vector_class create_sub_devices(size_t nbSubDev) const = 0; + + virtual vector_class + create_sub_devices(const vector_class &counts) const = 0; + + virtual vector_class + create_sub_devices(info::partition_affinity_domain affinityDomain) const = 0; + + static vector_class + get_devices(info::device_type deviceType = info::device_type::all); + + template + typename info::param_traits::return_type + get_info() const { + if (is_host()) { + return get_device_info_host(); + } + return get_device_info_cl< + typename info::param_traits::return_type, + param>::_(this->get()); + } + + bool is_partition_supported(info::partition_property Prop) const { + auto SupportedProperties = get_info(); + return std::find(SupportedProperties.begin(), SupportedProperties.end(), + Prop) != SupportedProperties.end(); + } + + bool + is_affinity_supported(info::partition_affinity_domain AffinityDomain) const { + auto SupportedDomains = + get_info(); + return std::find(SupportedDomains.begin(), SupportedDomains.end(), + AffinityDomain) != SupportedDomains.end(); + } + + virtual bool has_extension(const string_class &extension_name) const = 
0; +}; +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/device_info.hpp b/sycl/include/CL/sycl/detail/device_info.hpp new file mode 100644 index 000000000000..dc581c31160f --- /dev/null +++ b/sycl/include/CL/sycl/detail/device_info.hpp @@ -0,0 +1,481 @@ +//==-------- device_info.hpp - SYCL device info methods --------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +namespace cl { +namespace sycl { +namespace detail { + +vector_class read_fp_bitfield(cl_device_fp_config bits); + +vector_class +read_domain_bitfield(cl_device_affinity_domain bits); + +vector_class +read_execution_bitfield(cl_device_exec_capabilities bits); + +// Mapping expected SYCL return types to those returned by OpenCL calls +template struct sycl_to_ocl { using type = T; }; + +template <> struct sycl_to_ocl { using type = cl_bool; }; + +template <> struct sycl_to_ocl { using type = cl_device_id; }; + +template <> struct sycl_to_ocl { using type = cl_platform_id; }; + +// Mapping fp_config device info types to the values used to check fp support +template struct check_fp_support {}; + +template <> struct check_fp_support { + static const info::device value = info::device::native_vector_width_half; +}; + +template <> struct check_fp_support { + static const info::device value = info::device::native_vector_width_double; +}; + +// Structs for emulating function template partial specialization +// Default template for the general case +template struct get_device_info_cl { + static T _(cl_device_id dev) { + typename sycl_to_ocl::type result; + CHECK_OCL_CODE(clGetDeviceInfo(dev, (cl_device_info)param, sizeof(result), + &result, NULL)); + return T(result); + } +}; + +// Specialization for 
string return type, variable OpenCL return size +template struct get_device_info_cl { + static string_class _(cl_device_id dev) { + size_t resultSize; + CHECK_OCL_CODE( + clGetDeviceInfo(dev, (cl_device_info)param, 0, NULL, &resultSize)); + if (resultSize == 0) { + return string_class(); + } + unique_ptr_class result(new char[resultSize]); + CHECK_OCL_CODE(clGetDeviceInfo(dev, (cl_device_info)param, resultSize, + result.get(), NULL)); + return string_class(result.get()); + } +}; + +// Specialization for id return type +template struct get_device_info_cl, param> { + static id<3> _(cl_device_id dev) { + size_t result[3]; + CHECK_OCL_CODE(clGetDeviceInfo(dev, (cl_device_info)param, sizeof(result), + &result, NULL)); + return id<3>(result[0], result[1], result[2]); + } +}; + +// Specialization for fp_config types, checks the corresponding fp type support +template +struct get_device_info_cl, param> { + static vector_class _(cl_device_id dev) { + // Check if fp type is supported + if (!get_device_info_cl< + typename info::param_traits< + info::device, check_fp_support::value>::return_type, + check_fp_support::value>::_(dev)) { + return {}; + } + cl_device_fp_config result; + CHECK_OCL_CODE(clGetDeviceInfo(dev, (cl_device_info)param, sizeof(result), + &result, NULL)); + return read_fp_bitfield(result); + } +}; + +// Specialization for single_fp_config, no type support check required +template <> +struct get_device_info_cl, + info::device::single_fp_config> { + static vector_class _(cl_device_id dev) { + cl_device_fp_config result; + CHECK_OCL_CODE( + clGetDeviceInfo(dev, (cl_device_info)info::device::single_fp_config, + sizeof(result), &result, NULL)); + return read_fp_bitfield(result); + } +}; + +// Specialization for queue_profiling, OpenCL returns a bitfield +template <> struct get_device_info_cl { + static bool _(cl_device_id dev) { + cl_command_queue_properties result; + CHECK_OCL_CODE( + clGetDeviceInfo(dev, (cl_device_info)info::device::queue_profiling, + 
sizeof(result), &result, NULL)); + return (result & CL_QUEUE_PROFILING_ENABLE); + } +}; + +// Specialization for exec_capabilities, OpenCL returns a bitfield +template <> +struct get_device_info_cl, + info::device::execution_capabilities> { + static vector_class _(cl_device_id dev) { + cl_device_exec_capabilities result; + CHECK_OCL_CODE(clGetDeviceInfo( + dev, (cl_device_info)info::device::execution_capabilities, + sizeof(result), &result, NULL)); + return read_execution_bitfield(result); + } +}; + +// Specialization for built in kernels, splits the string returned by OpenCL +template <> +struct get_device_info_cl, + info::device::built_in_kernels> { + static vector_class _(cl_device_id dev) { + string_class result = + get_device_info_cl::_( + dev); + return split_string(result, ';'); + } +}; + +// Specialization for extensions, splits the string returned by OpenCL +template <> +struct get_device_info_cl, + info::device::extensions> { + static vector_class _(cl_device_id dev) { + string_class result = + get_device_info_cl::_(dev); + return split_string(result, ' '); + } +}; + +// Specialization for partition properties, variable OpenCL return size +template <> +struct get_device_info_cl, + info::device::partition_properties> { + static vector_class _(cl_device_id dev) { + size_t resultSize; + CHECK_OCL_CODE( + clGetDeviceInfo(dev, (cl_device_info)info::device::partition_properties, + 0, NULL, &resultSize)); + size_t arrayLength = resultSize / sizeof(cl_device_partition_property); + if (arrayLength == 0) { + return {}; + } + unique_ptr_class arrayResult( + new cl_device_partition_property[arrayLength]); + CHECK_OCL_CODE( + clGetDeviceInfo(dev, (cl_device_info)info::device::partition_properties, + resultSize, arrayResult.get(), NULL)); + + vector_class result; + for (size_t i = 0; i < arrayLength - 1; ++i) { + result.push_back(info::partition_property(arrayResult[i])); + } + return result; + } +}; + +// Specialization for partition affinity domains, OpenCL returns a 
bitfield +template <> +struct get_device_info_cl, + info::device::partition_affinity_domains> { + static vector_class _(cl_device_id dev) { + cl_device_affinity_domain result; + CHECK_OCL_CODE(clGetDeviceInfo( + dev, (cl_device_info)info::device::partition_affinity_domains, + sizeof(result), &result, NULL)); + return read_domain_bitfield(result); + } +}; + +// Specialization for partition type affinity domain, OpenCL can return other +// partition properties instead +template <> +struct get_device_info_cl { + static info::partition_affinity_domain _(cl_device_id dev) { + size_t resultSize; + CHECK_OCL_CODE(clGetDeviceInfo( + dev, (cl_device_info)info::device::partition_type_affinity_domain, 0, + NULL, &resultSize)); + if (resultSize != 1) { + return info::partition_affinity_domain::not_applicable; + } + cl_device_partition_property result; + CHECK_OCL_CODE(clGetDeviceInfo( + dev, (cl_device_info)info::device::partition_type_affinity_domain, + sizeof(result), &result, NULL)); + if (result == CL_DEVICE_AFFINITY_DOMAIN_NUMA || + result == CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE || + result == CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE || + result == CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE || + result == CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE) { + return info::partition_affinity_domain(result); + } + + return info::partition_affinity_domain::not_applicable; + } +}; + +// Specialization for partition type +template <> +struct get_device_info_cl { + static info::partition_property _(cl_device_id dev) { + size_t resultSize; + CHECK_OCL_CODE( + clGetDeviceInfo(dev, CL_DEVICE_PARTITION_TYPE, 0, NULL, &resultSize)); + if (!resultSize) + return info::partition_property::no_partition; + + size_t arrayLength = resultSize / sizeof(cl_device_partition_property); + + unique_ptr_class arrayResult( + new cl_device_partition_property[arrayLength]); + CHECK_OCL_CODE(clGetDeviceInfo(dev, CL_DEVICE_PARTITION_TYPE, resultSize, + arrayResult.get(), NULL)); + if (!arrayResult[0]) + return 
info::partition_property::no_partition; + return info::partition_property(arrayResult[0]); + } +}; + +// Specialization for parent device +template +struct get_device_info_cl { + static T _(cl_device_id dev) { + typename sycl_to_ocl::type result; + CHECK_OCL_CODE( + clGetDeviceInfo(dev, (cl_device_info)info::device::parent_device, + sizeof(result), &result, NULL)); + if (result == nullptr) + throw invalid_object_error( + "No parent for device because it is not a subdevice"); + return T(result); + } +}; + +// SYCL host device information + +// Default template is disabled, all possible instantiations are +// specified explicitly. +template +typename info::param_traits::return_type +get_device_info_host() = delete; + +template <> info::device_type get_device_info_host(); + +template <> cl_uint get_device_info_host(); + +template <> cl_uint get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> id<3> get_device_info_host(); + +template <> size_t get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +cl_uint get_native_vector_width(size_t idx); + +template <> +cl_uint get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> cl_uint get_device_info_host(); + +template <> cl_uint get_device_info_host(); + +template <> cl_ulong get_device_info_host(); + +template <> cl_ulong get_device_info_host(); + +template <> bool 
get_device_info_host(); + +template <> cl_uint get_device_info_host(); + +template <> cl_uint get_device_info_host(); + +template <> size_t get_device_info_host(); + +template <> size_t get_device_info_host(); + +template <> size_t get_device_info_host(); + +template <> size_t get_device_info_host(); + +template <> size_t get_device_info_host(); + +template <> size_t get_device_info_host(); + +template <> size_t get_device_info_host(); + +template <> cl_uint get_device_info_host(); + +template <> size_t get_device_info_host(); + +template <> cl_uint get_device_info_host(); + +template <> +vector_class +get_device_info_host(); + +template <> +vector_class +get_device_info_host(); + +template <> +vector_class +get_device_info_host(); + +template <> +info::global_mem_cache_type +get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> +cl_ulong get_device_info_host(); + +template <> +cl_ulong get_device_info_host(); + +template <> cl_uint get_device_info_host(); + +template <> +info::local_mem_type get_device_info_host(); + +template <> cl_ulong get_device_info_host(); + +template <> bool get_device_info_host(); + +template <> bool get_device_info_host(); + +template <> +size_t get_device_info_host(); + +template <> bool get_device_info_host(); + +template <> bool get_device_info_host(); + +template <> bool get_device_info_host(); + +template <> bool get_device_info_host(); + +template <> +vector_class +get_device_info_host(); + +template <> bool get_device_info_host(); + +template <> +vector_class +get_device_info_host(); + +template <> platform get_device_info_host(); + +template <> string_class get_device_info_host(); + +template <> string_class get_device_info_host(); + +template <> string_class get_device_info_host(); + +template <> string_class get_device_info_host(); + +template <> string_class get_device_info_host(); + +template <> string_class get_device_info_host(); + +template <> +vector_class get_device_info_host(); + 
+template <> size_t get_device_info_host(); + +template <> +bool get_device_info_host(); + +template <> device get_device_info_host(); + +template <> +cl_uint get_device_info_host(); + +template <> +vector_class +get_device_info_host(); + +template <> +vector_class +get_device_info_host(); + +template <> +info::partition_property +get_device_info_host(); + +template <> +info::partition_affinity_domain +get_device_info_host(); + +template <> cl_uint get_device_info_host(); + +template <> cl_uint get_device_info_host(); + +template <> +bool get_device_info_host< + info::device::sub_group_independent_forward_progress>(); + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/device_opencl.hpp b/sycl/include/CL/sycl/detail/device_opencl.hpp new file mode 100644 index 000000000000..a61206b8b760 --- /dev/null +++ b/sycl/include/CL/sycl/detail/device_opencl.hpp @@ -0,0 +1,143 @@ +//==------------ device_opencl.hpp - SYCL OpenCL device --------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +class device_selector; + +namespace cl { +namespace sycl { +namespace detail { +// TODO: 4.6.4 Partitioning into multiple SYCL devices +// TODO: 4.6.4.2 Device information descriptors +// TODO: Make code thread-safe +class device_opencl : public device_impl { +public: + /** Constructs a device class instance using cl device_id of the OpenCL + * device. 
*/ + explicit device_opencl(cl_device_id deviceId) { + id = deviceId; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE( + clGetDeviceInfo(id, CL_DEVICE_TYPE, sizeof(cl_device_type), &type, 0)); + cl_device_id parent; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetDeviceInfo(id, CL_DEVICE_PARENT_DEVICE, + sizeof(cl_device_id), &parent, nullptr)); + isRootDevice = (nullptr == parent); + if (!isRootDevice) { + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clRetainDevice(id)); + } + } + + ~device_opencl() { + if (!isRootDevice) { + // TODO replace CHECK_OCL_CODE_NO_EXC to CHECK_OCL_CODE and + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE_NO_EXC(clReleaseDevice(id)); + } + } + + cl_device_id get() const override { + if (!isRootDevice) { + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clRetainDevice(id)); + } + return id; + } + + bool is_host() const override { return false; } + + bool is_cpu() const override { return (type == CL_DEVICE_TYPE_CPU); } + + bool is_gpu() const override { return (type == CL_DEVICE_TYPE_GPU); } + + bool is_accelerator() const override { + return (type == CL_DEVICE_TYPE_ACCELERATOR); + } + + platform get_platform() const override { + cl_platform_id plt_id; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE( + clGetDeviceInfo(id, CL_DEVICE_PLATFORM, sizeof(plt_id), &plt_id, 0)); + return platform(plt_id); + } + + bool has_extension(const string_class &extension_name) const override { + string_class all_extension_names = + get_device_info_cl::_(id); + return (all_extension_names.find(extension_name) != std::string::npos); + } + + vector_class + create_sub_devices(const cl_device_partition_property *Properties, + size_t SubDevicesCount) const { + vector_class 
SubDevices(SubDevicesCount); + cl_uint ReturnedSubDevices; + CHECK_OCL_CODE(clCreateSubDevices(id, Properties, SubDevicesCount, + SubDevices.data(), &ReturnedSubDevices)); + return vector_class(SubDevices.begin(), SubDevices.end()); + } + + vector_class create_sub_devices(size_t ComputeUnits) const { + if (!is_partition_supported(info::partition_property::partition_equally)) { + throw cl::sycl::feature_not_supported(); + } + size_t SubDevicesCount = + get_info() / ComputeUnits; + const cl_device_partition_property Properties[3] = { + CL_DEVICE_PARTITION_EQUALLY, (cl_device_partition_property)ComputeUnits, + 0}; + return create_sub_devices(Properties, SubDevicesCount); + } + + vector_class + create_sub_devices(const vector_class &Counts) const { + if (!is_partition_supported( + info::partition_property::partition_by_counts)) { + throw cl::sycl::feature_not_supported(); + } + static const cl_device_partition_property P[] = { + CL_DEVICE_PARTITION_BY_COUNTS, CL_DEVICE_PARTITION_BY_COUNTS_LIST_END, + 0}; + vector_class Properties(P, P + 3); + Properties.insert(Properties.begin() + 1, Counts.begin(), Counts.end()); + return create_sub_devices(Properties.data(), Counts.size()); + } + + vector_class + create_sub_devices(info::partition_affinity_domain AffinityDomain) const { + if (!is_partition_supported( + info::partition_property::partition_by_affinity_domain) || + !is_affinity_supported(AffinityDomain)) { + throw cl::sycl::feature_not_supported(); + } + const cl_device_partition_property Properties[3] = { + CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, + (cl_device_partition_property)AffinityDomain, 0}; + size_t SubDevicesCount = + get_info(); + return create_sub_devices(Properties, SubDevicesCount); + } + +private: + cl_device_id id = 0; + cl_device_type type = 0; + bool isRootDevice = false; +}; +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/event_impl.hpp b/sycl/include/CL/sycl/detail/event_impl.hpp new file mode 
100644 index 000000000000..833cc335bb32 --- /dev/null +++ b/sycl/include/CL/sycl/detail/event_impl.hpp @@ -0,0 +1,59 @@ +//==---------------- event_impl.hpp - SYCL event ---------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +#include + +namespace cl { +namespace sycl { +namespace detail { + +class event_impl { +public: + event_impl() = default; + event_impl(cl_event CLEvent, const context &SyclContext); + + // Threat all devices that don't support interoperability as host devices to + // avoid attempts to call method get on such events. + bool is_host() const; + + cl_event get() const; + + // Self is needed in order to pass shared_ptr to Scheduler. + void wait(std::shared_ptr Self) const; + + template + typename info::param_traits::return_type + get_profiling_info() const; + + template + typename info::param_traits::return_type get_info() const; + + ~event_impl(); + + void waitInternal() const; + + cl_event &getHandleRef(); + + void setIsHostEvent(bool Value); + +private: + cl_event m_Event = nullptr; + bool m_OpenCLInterop = false; + bool m_HostEvent = true; +}; + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/event_info.hpp b/sycl/include/CL/sycl/detail/event_info.hpp new file mode 100644 index 000000000000..56725642d80e --- /dev/null +++ b/sycl/include/CL/sycl/detail/event_info.hpp @@ -0,0 +1,45 @@ +//==---------------- event_info.hpp - SYCL event ---------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +namespace cl { +namespace sycl { +namespace detail { + +template struct get_event_profiling_info_cl { + using RetType = + typename info::param_traits::return_type; + + static RetType _(cl_event Event) { + RetType Result = 0; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetEventProfilingInfo(Event, cl_profiling_info(Param), + sizeof(Result), &Result, nullptr)); + return Result; + } +}; + +template struct get_event_info_cl { + using RetType = typename info::param_traits::return_type; + + static RetType _(cl_event Event) { + RetType Result = (RetType)0; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetEventInfo(Event, cl_profiling_info(Param), + sizeof(Result), &Result, nullptr)); + return Result; + } +}; + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/helpers.hpp b/sycl/include/CL/sycl/detail/helpers.hpp new file mode 100644 index 000000000000..f8e95977eee4 --- /dev/null +++ b/sycl/include/CL/sycl/detail/helpers.hpp @@ -0,0 +1,72 @@ +//==---------------- helpers.hpp - SYCL helpers ----------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +namespace cl { +namespace sycl { +class context; +class event; +template class item; +template class group; +template class range; +template class id; +template class nd_item; +namespace detail { + +// The function returns list of events that can be passed to OpenCL API as +// dependency list and waits for others. 
+std::vector +getOrWaitEvents(std::vector DepEvents, + cl::sycl::context Context); + +void waitEvents(std::vector DepEvents); + +struct Builder { + Builder() = delete; + template + static group createGroup(const cl::sycl::range &G, + const cl::sycl::range &L, + const cl::sycl::id &I) { + return cl::sycl::group(G, L, I); + } + + template + static item createItem( + typename std::enable_if<(with_offset == true), + const cl::sycl::range>::type &R, + const cl::sycl::id &I, const cl::sycl::id &O) { + return cl::sycl::item(R, I, O); + } + + template + static item createItem( + typename std::enable_if<(with_offset == false), + const cl::sycl::range>::type &R, + const cl::sycl::id &I) { + return cl::sycl::item(R, I); + } + + template + static nd_item + createNDItem(const cl::sycl::item &GL, + const cl::sycl::item &L, + const cl::sycl::group &GR) { + return cl::sycl::nd_item(GL, L, GR); + } +}; + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/image_impl.hpp b/sycl/include/CL/sycl/detail/image_impl.hpp new file mode 100644 index 000000000000..a91eaf78ce53 --- /dev/null +++ b/sycl/include/CL/sycl/detail/image_impl.hpp @@ -0,0 +1,160 @@ +//==------------ image_impl.hpp --------------------------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +namespace cl { +namespace sycl { + +enum class image_channel_order : unsigned int { + a, + r, + rx, + rg, + rgx, + ra, + rgb, + rgbx, + rgba, + argb, + bgra, + intensity, + luminance, + abgr +}; + +enum class image_channel_type : unsigned int { + snorm_int8, + snorm_int16, + unorm_int8, + unorm_int16, + unorm_short_565, + unorm_short_555, + unorm_int_101010, + signed_int8, + signed_int16, + signed_int32, + unsigned_int8, + unsigned_int16, + unsigned_int32, + fp16, + fp32 +}; + +namespace detail { + +template class image_impl { +public: + image_impl(image_channel_order order, image_channel_type type, + const range &range, + const property_list &propList) { + assert(!"Not implemented"); + } + + //image_impl(image_channel_order order, image_channel_type type, + //const range &range, AllocatorT allocator, + //const property_list &propList = {}); + + /* Available only when: dimensions > 1 */ + // image_impl(image_channel_order order, image_channel_type type, + // const range &range, const range &pitch, + // const property_list &propList = {}); + + /* Available only when: dimensions > 1 */ + // image_impl(image_channel_order order, image_channel_type type, + // const range &range, const range &pitch, + // AllocatorT allocator, const property_list &propList = {}); + + //image_impl(void *hostPointer, image_channel_order order, + //image_channel_type type, const range &range, + //const property_list &propList = {}); + + //image_impl(void *hostPointer, image_channel_order order, + //image_channel_type type, const range &range, + //AllocatorT allocator, const property_list &propList = {}); + + //image_impl(const void *hostPointer, image_channel_order order, + //image_channel_type type, const range &range, + //const property_list &propList = {}); + + //image_impl(const void *hostPointer, image_channel_order order, + //image_channel_type type, const range &range, + 
//AllocatorT allocator, const property_list &propList = {}); + + /* Available only when: dimensions > 1 */ + // image_impl(void *hostPointer, image_channel_order order, image_channel_type + // type, + // const range &range, range &pitch, + // const property_list &propList = {}) {assert(!"Not implemented");} + + /* Available only when: dimensions > 1 */ + // image_impl(void *hostPointer, image_channel_order order, image_channel_type + // type, + // const range &range, range &pitch, + // AllocatorT allocator, const property_list &propList = {}) {assert(!"Not + // implemented");} + + //image_impl(shared_ptr_class &hostPointer, image_channel_order order, + //image_channel_type type, const range &range, + //const property_list &propList = {}); + + //image_impl(shared_ptr_class &hostPointer, image_channel_order order, + //image_channel_type type, const range &range, + //AllocatorT allocator, const property_list &propList = {}); + + /* Available only when: dimensions > 1 */ + // image_impl(shared_ptr_class &hostPointer, image_channel_order order, + // image_channel_type type, const range &range, + // const range &pitch, const property_list &propList = {}) + // {assert(!"Not implemented");} + + /* Available only when: dimensions > 1 */ + // image_impl(shared_ptr_class &hostPointer, image_channel_order order, + // image_channel_type type, const range &range, + // const range &pitch, AllocatorT allocator, + // const property_list &propList = {}) {assert(!"Not implemented");} + + //image_impl(cl_mem clMemObject, const context &syclContext, + //event availableEvent = {}); + + /* -- property interface members -- */ + + range get_range() const { assert(!"Not implemented"); } + + /* Available only when: dimensions > 1 */ + range get_pitch() const { assert(!"Not implemented"); } + + size_t get_size() const { assert(!"Not implemented"); return 0;} + + size_t get_count() const { assert(!"Not implemented"); return 0; } + + AllocatorT get_allocator() const { assert(!"Not 
implemented"); } + + template + accessor + get_access(handler &commandGroupHandler) { + assert(!"Not implemented"); + } + + template + accessor + get_access() { + assert(!"Not implemented"); + } + + // template + // void set_final_data(Destination finalData = std::nullptr); + + void set_write_back(bool flag) { assert(!"Not implemented"); } +}; + +} // namespace detail + +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/kernel_desc.hpp b/sycl/include/CL/sycl/detail/kernel_desc.hpp new file mode 100644 index 000000000000..25862ab7dea3 --- /dev/null +++ b/sycl/include/CL/sycl/detail/kernel_desc.hpp @@ -0,0 +1,43 @@ +//==----------------------- kernel_desc.hpp --------------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===//// + +#pragma once + +#include + +namespace cl { +namespace sycl { +namespace detail { + +// kernel parameter kinds +enum class kernel_param_kind_t { + kind_accessor, + kind_std_layout, // standard layout object parameters + kind_sampler +}; + +// describes a kernel parameter +struct kernel_param_desc_t { + // parameter kind + kernel_param_kind_t kind; + // kind == kind_std_layout + // parameter size in bytes (includes padding for structs) + // kind == kind_accessor + // access target; possible access targets are defined in access/access.hpp + int info; + // offset of the captured value of the parameter in the lambda or function + // object + int offset; +}; + +template struct KernelInfo; + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/kernel_impl.hpp b/sycl/include/CL/sycl/detail/kernel_impl.hpp new file mode 100644 index 000000000000..ea1a9ca0a91e --- /dev/null +++ b/sycl/include/CL/sycl/detail/kernel_impl.hpp @@ -0,0 +1,128 @@ +//==------- 
kernel_impl.hpp --- SYCL kernel implementation -----------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +namespace cl { +namespace sycl { +// Forward declaration +class program; + +namespace detail { +class program_impl; + +class kernel_impl { +public: + kernel_impl(cl_kernel ClKernel, const context &SyclContext); + + kernel_impl(cl_kernel ClKernel, const context &SyclContext, + std::shared_ptr ProgramImpl) + : ClKernel(ClKernel), Context(SyclContext), ProgramImpl(ProgramImpl) {} + + // Host kernel constructor + kernel_impl(const context &SyclContext, + std::shared_ptr ProgramImpl) + : Context(SyclContext), ProgramImpl(ProgramImpl) {} + + ~kernel_impl() { + // TODO replace CHECK_OCL_CODE_NO_EXC to CHECK_OCL_CODE and + // TODO catch an exception and put it to list of asynchronous exceptions + if (!is_host()) { + CHECK_OCL_CODE_NO_EXC(clReleaseKernel(ClKernel)); + } + } + + cl_kernel get() const { + if (is_host()) { + throw invalid_object_error("This instance of kernel is a host instance"); + } + CHECK_OCL_CODE(clRetainKernel(ClKernel)); + return ClKernel; + } + + bool is_host() const { return Context.is_host(); } + + context get_context() const { return Context; } + + program get_program() const; + + template + typename info::param_traits::return_type + get_info() const { + if (is_host()) { + // TODO implement + assert(0 && "Not implemented"); + } + return get_kernel_info_cl< + typename info::param_traits::return_type, + param>::_(this->get()); + } + + template + typename info::param_traits::return_type + get_work_group_info(const device &Device) const { + if (is_host()) { + return get_kernel_work_group_info_host(Device); + } + return get_kernel_work_group_info_cl< + 
typename info::param_traits::return_type, + param>::_(this->get(), Device.get()); + } + + template + typename info::param_traits::return_type + get_sub_group_info(const device &Device) const { + if (is_host()) { + throw runtime_error("Sub-group feature is not supported on HOST device."); + } + return get_kernel_sub_group_info_cl< + typename info::param_traits::return_type, + param>::_(this->get(), Device.get()); + } + + template + typename info::param_traits::return_type + get_sub_group_info( + const device &Device, + typename info::param_traits::input_type + Value) const { + if (is_host()) { + throw runtime_error("Sub-group feature is not supported on HOST device."); + } + return get_kernel_sub_group_info_with_input_cl< + typename info::param_traits::return_type, + param, + typename info::param_traits::input_type>::_(this->get(), + Device.get(), Value); + } + +private: + cl_kernel ClKernel; + context Context; + std::shared_ptr ProgramImpl; +}; + +template <> context kernel_impl::get_info() const; + +template <> program kernel_impl::get_info() const; + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/kernel_info.hpp b/sycl/include/CL/sycl/detail/kernel_info.hpp new file mode 100644 index 000000000000..cbae1fb42edb --- /dev/null +++ b/sycl/include/CL/sycl/detail/kernel_info.hpp @@ -0,0 +1,161 @@ +//==-------- kernel_info.hpp - SYCL kernel info methods --------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +namespace cl { +namespace sycl { +namespace detail { + +// OpenCL kernel information methods +template struct get_kernel_info_cl {}; + +template struct get_kernel_info_cl { + static string_class _(cl_kernel ClKernel) { + size_t ResultSize; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetKernelInfo(ClKernel, cl_kernel_info(Param), 0, nullptr, + &ResultSize)); + if (ResultSize == 0) { + return ""; + } + string_class Result(ResultSize, ' '); + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetKernelInfo(ClKernel, cl_kernel_info(Param), ResultSize, + &Result[0], nullptr)); + return Result; + } +}; + +template struct get_kernel_info_cl { + static cl_uint _(cl_kernel ClKernel) { + cl_uint Result; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetKernelInfo(ClKernel, cl_kernel_info(Param), + sizeof(cl_uint), &Result, nullptr)); + return Result; + } +}; + +// OpenCL kernel work-group methods + +template +struct get_kernel_work_group_info_cl { + static T _(cl_kernel ClKernel, cl_device_id ClDevice) { + T Result; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetKernelWorkGroupInfo(ClKernel, ClDevice, + cl_kernel_work_group_info(Param), + sizeof(T), &Result, nullptr)); + return Result; + } +}; + +template +struct get_kernel_work_group_info_cl, Param> { + static cl::sycl::range<3> _(cl_kernel ClKernel, cl_device_id ClDevice) { + size_t Result[3]; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetKernelWorkGroupInfo( + ClKernel, ClDevice, cl_kernel_work_group_info(Param), + sizeof(size_t) * 3, Result, nullptr)); + return cl::sycl::range<3>(Result[0], Result[1], Result[2]); + } +}; + +template +typename 
info::param_traits::return_type +get_kernel_work_group_info_host(const cl::sycl::device &Device); + +template <> +cl::sycl::range<3> +get_kernel_work_group_info_host( + const cl::sycl::device &Device); + +template <> +size_t +get_kernel_work_group_info_host( + const cl::sycl::device &Device); + +template <> +cl::sycl::range<3> get_kernel_work_group_info_host< + info::kernel_work_group::compile_work_group_size>( + const cl::sycl::device &Device); + +template <> +size_t get_kernel_work_group_info_host< + info::kernel_work_group::preferred_work_group_size_multiple>( + const cl::sycl::device &Device); + +template <> +cl_ulong +get_kernel_work_group_info_host( + const cl::sycl::device &Device); + +// OpenCL kernel sub-group methods + +template +struct get_kernel_sub_group_info_cl { + static TOut _(cl_kernel ClKernel, cl_device_id ClDevice) { + TOut Result; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetKernelSubGroupInfo( + ClKernel, ClDevice, cl_kernel_sub_group_info(Param), 0, nullptr, + sizeof(TOut), &Result, nullptr)); + return Result; + } +}; + +template +struct get_kernel_sub_group_info_with_input_cl { + static TOut _(cl_kernel ClKernel, cl_device_id ClDevice, TIn In) { + TOut Result; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetKernelSubGroupInfo( + ClKernel, ClDevice, cl_kernel_sub_group_info(Param), sizeof(TIn), &In, + sizeof(TOut), &Result, nullptr)); + return Result; + } +}; + +template +struct get_kernel_sub_group_info_with_input_cl, Param, + size_t> { + static cl::sycl::range<3> _(cl_kernel ClKernel, cl_device_id ClDevice, + size_t In) { + size_t Result[3]; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetKernelSubGroupInfo( + ClKernel, ClDevice, cl_kernel_sub_group_info(Param), sizeof(size_t), + &In, sizeof(size_t) * 3, Result, nullptr)); + return cl::sycl::range<3>(Result[0], Result[1], Result[2]); + } +}; + 
+template +struct get_kernel_sub_group_info_with_input_cl> { + static size_t _(cl_kernel ClKernel, cl_device_id ClDevice, + cl::sycl::range<3> In) { + size_t Input[3] = {In[0], In[1], In[2]}; + size_t Result; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetKernelSubGroupInfo( + ClKernel, ClDevice, cl_kernel_sub_group_info(Param), sizeof(size_t) * 3, + Input, sizeof(size_t), &Result, nullptr)); + return Result; + } +}; +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/platform_host.hpp b/sycl/include/CL/sycl/detail/platform_host.hpp new file mode 100644 index 000000000000..ceed82b0a5bd --- /dev/null +++ b/sycl/include/CL/sycl/detail/platform_host.hpp @@ -0,0 +1,41 @@ +//==------------ platform_host.hpp - SYCL host platform --------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once +#include + +// 4.6.2 Platform class for host platform +namespace cl { +namespace sycl { + +// Forward declaration +class device; + +namespace detail { +// TODO: implement extension management +// TODO: implement parameters treatment + +class platform_host : public platform_impl { +public: + vector_class get_devices( + info::device_type dev_type = info::device_type::all) const override; + + bool has_extension(const string_class &extension_name) const override { + return false; + } + + cl_platform_id get() const override { + throw invalid_object_error("This instance of platform is a host instance"); + } + + bool is_host() const override { return true; } +}; // class platform_host +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/platform_impl.hpp b/sycl/include/CL/sycl/detail/platform_impl.hpp new file mode 100644 index 000000000000..2da2488a1f38 --- /dev/null +++ b/sycl/include/CL/sycl/detail/platform_impl.hpp @@ -0,0 +1,57 @@ +//==-------------- platform_impl.hpp - SYCL platform -----------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +// 4.6.2 Platform class +namespace cl { +namespace sycl { + +// Forward declaration +class device_selector; +class device; + +namespace detail { + +class platform_impl { +public: + platform_impl() = default; + + explicit platform_impl(const device_selector &); + + virtual bool has_extension(const string_class &extension_name) const = 0; + + virtual vector_class + get_devices(info::device_type = info::device_type::all) const = 0; + + template + typename info::param_traits::return_type + get_info() const { + if (is_host()) { + return get_platform_info_host(); + } + return get_platform_info_cl< + typename info::param_traits::return_type, + param>::_(this->get()); + } + + virtual bool is_host() const = 0; + + virtual cl_platform_id get() const = 0; + + virtual ~platform_impl() = default; +}; // class platform_impl + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/platform_info.hpp b/sycl/include/CL/sycl/detail/platform_info.hpp new file mode 100644 index 000000000000..04d97f712c94 --- /dev/null +++ b/sycl/include/CL/sycl/detail/platform_info.hpp @@ -0,0 +1,68 @@ +//==------ platform_info.hpp - SYCL platform info methods ------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +namespace cl { +namespace sycl { +namespace detail { + +// OpenCL platform information methods +template struct get_platform_info_cl {}; + +template +struct get_platform_info_cl { + static string_class _(cl_platform_id plt) { + size_t resultSize; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE( + clGetPlatformInfo(plt, cl_platform_info(param), 0, NULL, &resultSize)); + if (resultSize == 0) { + return ""; + } + unique_ptr_class result(new char[resultSize]); + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetPlatformInfo(plt, cl_platform_info(param), resultSize, + result.get(), NULL)); + return result.get(); + } +}; + +template <> +struct get_platform_info_cl, + info::platform::extensions> { + static vector_class _(cl_platform_id plt) { + string_class result = + get_platform_info_cl::_(plt); + return split_string(result, ' '); + } +}; + +// Host platform information methods +template +typename info::param_traits::return_type +get_platform_info_host() = delete; + +template <> string_class get_platform_info_host(); + +template <> string_class get_platform_info_host(); + +template <> string_class get_platform_info_host(); + +template <> string_class get_platform_info_host(); + +template <> +vector_class get_platform_info_host(); + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/platform_opencl.hpp b/sycl/include/CL/sycl/detail/platform_opencl.hpp new file mode 100644 index 000000000000..47aaf1d459d5 --- /dev/null +++ b/sycl/include/CL/sycl/detail/platform_opencl.hpp @@ -0,0 +1,45 @@ +//==-------- platform_opencl.hpp - SYCL OpenCL platform --------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once +#include + +// 4.6.2 Platform class for opencl platform +namespace cl { +namespace sycl { + +// Forward declaration +class device_selector; +class device; + +namespace detail { +// TODO: implement parameters treatment +class platform_opencl : public platform_impl { +public: + platform_opencl(cl_platform_id platform_id) : id(platform_id) {} + + vector_class get_devices( + info::device_type deviceType = info::device_type::all) const override; + + bool has_extension(const string_class &extension_name) const override { + string_class all_extension_names = + get_platform_info_cl::_(id); + return (all_extension_names.find(extension_name) != std::string::npos); + } + + cl_platform_id get() const override { return id; } + + bool is_host() const override { return false; } + +private: + cl_platform_id id = 0; +}; // class platform_opencl +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/program_impl.hpp b/sycl/include/CL/sycl/detail/program_impl.hpp new file mode 100644 index 000000000000..d10f1c636bee --- /dev/null +++ b/sycl/include/CL/sycl/detail/program_impl.hpp @@ -0,0 +1,378 @@ +//==----- program_impl.hpp --- SYCL program implementation -----------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace cl { +namespace sycl { + +enum class program_state { none, compiled, linked }; + +namespace detail { + +class program_impl { +public: + program_impl() = delete; + + explicit program_impl(const context &Context) + : program_impl(Context, Context.get_devices()) {} + + program_impl(const context &Context, vector_class DeviceList) + : Context(Context), Devices(DeviceList) {} + + program_impl(vector_class> ProgramList, + string_class LinkOptions = "") + : State(program_state::linked), LinkOptions(LinkOptions) { + // Verify arguments + if (ProgramList.empty()) { + throw runtime_error("Non-empty vector of programs expected"); + } + Context = ProgramList[0]->Context; + Devices = ProgramList[0]->Devices; + for (const auto &Prg : ProgramList) { + Prg->throw_if_state_is_not(program_state::compiled); + if (Prg->Context != Context) { + throw invalid_object_error( + "Not all programs are associated with the same context"); + } + if (Prg->Devices != Devices) { + throw invalid_object_error( + "Not all programs are associated with the same devices"); + } + } + + if (!is_host()) { + vector_class ClDevices(get_cl_devices()); + vector_class ClPrograms; + for (const auto &Prg : ProgramList) { + ClPrograms.push_back(Prg->ClProgram); + } + cl_int Err; + ClProgram = + clLinkProgram(Context.get(), ClDevices.size(), ClDevices.data(), + LinkOptions.c_str(), ProgramList.size(), + ClPrograms.data(), nullptr, nullptr, &Err); + CHECK_OCL_CODE_THROW(Err, compile_program_error); + } + } + + program_impl(const context &Context, cl_program ClProgram) + : ClProgram(ClProgram), Context(Context) { + // TODO it's unclear how to handle getting compile, link and build options + // in this case + // TODO handle the case when cl_program build is in progress + cl_uint NumDevices; + 
CHECK_OCL_CODE(clGetProgramInfo(ClProgram, CL_PROGRAM_NUM_DEVICES, + sizeof(cl_uint), &NumDevices, nullptr)); + vector_class ClDevices(NumDevices); + CHECK_OCL_CODE(clGetProgramInfo(ClProgram, CL_PROGRAM_DEVICES, + sizeof(cl_device_id) * NumDevices, + ClDevices.data(), nullptr)); + Devices = vector_class(ClDevices.begin(), ClDevices.end()); + // TODO check build for each device instead + cl_program_binary_type BinaryType; + CHECK_OCL_CODE(clGetProgramBuildInfo( + ClProgram, Devices[0].get(), CL_PROGRAM_BINARY_TYPE, + sizeof(cl_program_binary_type), &BinaryType, nullptr)); + switch (BinaryType) { + case CL_PROGRAM_BINARY_TYPE_NONE: + State = program_state::none; + break; + case CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT: + State = program_state::compiled; + break; + case CL_PROGRAM_BINARY_TYPE_LIBRARY: + case CL_PROGRAM_BINARY_TYPE_EXECUTABLE: + State = program_state::linked; + } + CHECK_OCL_CODE(clRetainProgram(ClProgram)); + } + + program_impl(const context &Context, cl_kernel ClKernel) + : program_impl( + Context, + ProgramManager::getInstance().getClProgramFromClKernel(ClKernel)) {} + + ~program_impl() { + // TODO replace CHECK_OCL_CODE_NO_EXC to CHECK_OCL_CODE and + // catch an exception and put it to list of asynchronous exceptions + if (!is_host() && ClProgram != nullptr) { + CHECK_OCL_CODE_NO_EXC(clReleaseProgram(ClProgram)); + } + } + + cl_program get() const { + throw_if_state_is(program_state::none); + if (is_host()) { + throw invalid_object_error("This instance of program is a host instance"); + } + CHECK_OCL_CODE(clRetainProgram(ClProgram)); + return ClProgram; + } + + bool is_host() const { return Context.is_host(); } + + template + void compile_with_kernel_type(string_class CompileOptions = "") { + throw_if_state_is_not(program_state::none); + // TODO Check for existence of kernel + if (!is_host()) { + create_cl_program_with_il(); + compile(CompileOptions); + } + State = program_state::compiled; + } + + void compile_with_source(string_class KernelSource, 
+ string_class CompileOptions = "") { + throw_if_state_is_not(program_state::none); + // TODO should it throw if it's host? + if (!is_host()) { + create_cl_program_with_source(KernelSource); + compile(CompileOptions); + } + State = program_state::compiled; + } + + template + void build_with_kernel_type(string_class BuildOptions = "") { + throw_if_state_is_not(program_state::none); + // TODO Check for existence of kernel + if (!is_host()) { + create_cl_program_with_il(); + build(BuildOptions); + } + State = program_state::linked; + } + + void build_with_source(string_class KernelSource, + string_class BuildOptions = "") { + throw_if_state_is_not(program_state::none); + // TODO should it throw if it's host? + if (!is_host()) { + create_cl_program_with_source(KernelSource); + build(BuildOptions); + } + State = program_state::linked; + } + + void link(string_class LinkOptions = "") { + throw_if_state_is_not(program_state::compiled); + if (!is_host()) { + vector_class ClDevices(get_cl_devices()); + cl_int Err; + ClProgram = clLinkProgram(Context.get(), ClDevices.size(), + ClDevices.data(), LinkOptions.c_str(), 1, + &ClProgram, nullptr, nullptr, &Err); + CHECK_OCL_CODE_THROW(Err, compile_program_error); + LinkOptions = LinkOptions; + } + State = program_state::linked; + } + + template + bool has_kernel() const +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else + { + throw_if_state_is(program_state::none); + if (is_host()) { + return true; + } + return has_cl_kernel(KernelInfo::getName()); + } +#endif + + bool has_kernel(string_class KernelName) const { + throw_if_state_is(program_state::none); + if (is_host()) { + return false; + } + return has_cl_kernel(KernelName); + } + + template + kernel get_kernel(std::shared_ptr PtrToSelf) const +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else + { + throw_if_state_is(program_state::none); + if (is_host()) { + return createSyclObjFromImpl( + std::make_shared(Context, PtrToSelf)); + } + return createSyclObjFromImpl(std::make_shared( + 
get_cl_kernel(KernelInfo::getName()), Context, PtrToSelf)); + } +#endif + + kernel get_kernel(string_class KernelName, + std::shared_ptr PtrToSelf) const { + throw_if_state_is(program_state::none); + if (is_host()) { + throw invalid_object_error("This instance of program is a host instance"); + } + return createSyclObjFromImpl(std::make_shared( + get_cl_kernel(KernelName), Context, PtrToSelf)); + } + + template + typename info::param_traits::return_type + get_info() const; + + vector_class> get_binaries() const { + throw_if_state_is(program_state::none); + vector_class BinarySizes(Devices.size()); + CHECK_OCL_CODE(clGetProgramInfo(ClProgram, CL_PROGRAM_BINARY_SIZES, + sizeof(size_t) * BinarySizes.size(), + BinarySizes.data(), nullptr)); + + vector_class> Result; + vector_class Pointers; + for (size_t I = 0; I < BinarySizes.size(); ++I) { + Result.emplace_back(BinarySizes[I]); + Pointers.push_back(Result[I].data()); + } + CHECK_OCL_CODE(clGetProgramInfo(ClProgram, CL_PROGRAM_BINARIES, + sizeof(char *) * Pointers.size(), + Pointers.data(), nullptr)); + return Result; + } + + context get_context() const { return Context; } + + vector_class get_devices() const { return Devices; } + + string_class get_compile_options() const { return CompileOptions; } + + string_class get_link_options() const { return LinkOptions; } + + string_class get_build_options() const { return BuildOptions; } + + program_state get_state() const { return State; } + +private: + void create_cl_program_with_il() { + assert(!ClProgram && "This program already has an encapsulated cl_program"); + ClProgram = ProgramManager::getInstance().getBuiltOpenCLProgram(Context); + } + + void create_cl_program_with_source(const string_class &Source) { + assert(!ClProgram && "This program already has an encapsulated cl_program"); + cl_int Err; + const char *Src = Source.c_str(); + size_t Size = Source.size(); + ClProgram = clCreateProgramWithSource(Context.get(), 1, &Src, &Size, &Err); + CHECK_OCL_CODE(Err); + } + 
+ void compile(const string_class &Options) { + vector_class ClDevices(get_cl_devices()); + // TODO make the exception message more descriptive + if (clCompileProgram(ClProgram, ClDevices.size(), ClDevices.data(), + Options.c_str(), 0, nullptr, nullptr, nullptr, + nullptr) != CL_SUCCESS) { + throw compile_program_error("Program compilation error"); + } + CompileOptions = Options; + } + + void build(const string_class &Options) { + vector_class ClDevices(get_cl_devices()); + // TODO make the exception message more descriptive + if (clBuildProgram(ClProgram, ClDevices.size(), ClDevices.data(), + Options.c_str(), nullptr, nullptr) != CL_SUCCESS) { + throw compile_program_error("Program build error"); + } + BuildOptions = Options; + } + + vector_class get_cl_devices() const { + vector_class ClDevices; + for (const auto &Device : Devices) { + ClDevices.push_back(Device.get()); + } + return ClDevices; + } + + bool has_cl_kernel(const string_class &KernelName) const { + size_t Size; + CHECK_OCL_CODE(clGetProgramInfo(ClProgram, CL_PROGRAM_KERNEL_NAMES, 0, + nullptr, &Size)); + string_class ClResult(Size, ' '); + CHECK_OCL_CODE(clGetProgramInfo(ClProgram, CL_PROGRAM_KERNEL_NAMES, + ClResult.size(), &ClResult[0], nullptr)); + // Get rid of the null terminator + ClResult.pop_back(); + vector_class KernelNames(split_string(ClResult, ';')); + for (const auto &Name : KernelNames) { + if (Name == KernelName) { + return true; + } + } + return false; + } + + cl_kernel get_cl_kernel(const string_class &KernelName) const { + cl_int Err; + cl_kernel ClKernel = clCreateKernel(ClProgram, KernelName.c_str(), &Err); + if (Err == CL_INVALID_KERNEL_NAME) { + throw invalid_object_error( + "This instance of program does not contain the kernel requested"); + } + CHECK_OCL_CODE(Err); + return ClKernel; + } + + void throw_if_state_is(program_state State) const { + if (this->State == State) { + throw invalid_object_error("Invalid program state"); + } + } + + void 
throw_if_state_is_not(program_state State) const { + if (this->State != State) { + throw invalid_object_error("Invalid program state"); + } + } + + cl_program ClProgram = nullptr; + program_state State = program_state::none; + context Context; + vector_class Devices; + string_class CompileOptions; + string_class LinkOptions; + string_class BuildOptions; +}; + +template <> +cl_uint program_impl::get_info() const; + +template <> context program_impl::get_info() const; + +template <> +vector_class program_impl::get_info() const; + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/program_manager/program_manager.hpp b/sycl/include/CL/sycl/detail/program_manager/program_manager.hpp new file mode 100644 index 000000000000..56376d7848f8 --- /dev/null +++ b/sycl/include/CL/sycl/detail/program_manager/program_manager.hpp @@ -0,0 +1,86 @@ +//==------ program_manager.hpp --- SYCL program manager---------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +#include +#include + +/// This struct is a record of the device image information +struct __tgt_device_image { + void *ImageStart; // Pointer to the target code start + void *ImageEnd; // Pointer to the target code end +}; + +/// This struct is a record of all the host code that may be offloaded to a +/// target. +struct __tgt_bin_desc { + int32_t NumDeviceImages; // Number of device types supported + __tgt_device_image *DeviceImages; // Array of device images (1 per dev. type) +}; + +// +++ Entry points referenced by the offload wrapper object { + +/// Executed as a part of current module's (.exe, .dll) static initialization. +/// Registers device executable images with the runtime. 
+extern "C" void __tgt_register_lib(__tgt_bin_desc *desc); + +/// Executed as a part of current module's (.exe, .dll) static +/// de-initialization. +/// Unregisters device executable images with the runtime. +extern "C" void __tgt_unregister_lib(__tgt_bin_desc *desc); + +// +++ } + +namespace cl { +namespace sycl { +class context; +namespace detail { + +// Provides single loading and building OpenCL programs with unique contexts +// that is necessary for no interoperability cases with lambda. +class ProgramManager { +public: + static ProgramManager &getInstance(); + cl_program getBuiltOpenCLProgram(const context &Context); + cl_kernel getOrCreateKernel(const context &Context, const char *KernelName); + cl_program getClProgramFromClKernel(cl_kernel ClKernel); + + void setDeviceImages(__tgt_bin_desc *_DeviceImages) { + // TODO thread-unsafe, see comments in __tgt_register_lib + DeviceImages = _DeviceImages; + } + +private: + const vector_class getSpirvSource(); + void build(cl_program &ClProgram, const string_class &Options = "", + std::vector ClDevices = std::vector()); + + struct ContextLess { + bool operator()(const context &LHS, const context &RHS) const; + }; + + ProgramManager() : DeviceImages(nullptr) {} + ~ProgramManager() = default; + ProgramManager(ProgramManager const &) = delete; + ProgramManager &operator=(ProgramManager const &) = delete; + + unique_ptr_class> m_SpirvSource; + std::map m_CachedSpirvPrograms; + std::map> m_CachedKernels; + + /// Device executable images available in this module (.exe or .dll). 
+ __tgt_bin_desc *DeviceImages; +}; +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/queue_impl.hpp b/sycl/include/CL/sycl/detail/queue_impl.hpp new file mode 100644 index 000000000000..1c2782bbd53a --- /dev/null +++ b/sycl/include/CL/sycl/detail/queue_impl.hpp @@ -0,0 +1,170 @@ +//==------------------ queue_impl.hpp - SYCL queue -------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace cl { +namespace sycl { +namespace detail { + +class queue_impl { +public: + queue_impl(const device &SyclDevice, async_handler AsyncHandler, + const property_list &PropList) + : m_Device(SyclDevice), m_Context(m_Device), m_AsyncHandler(AsyncHandler), + m_PropList(PropList), m_HostQueue(m_Device.is_host()) { + m_OpenCLInterop = !m_HostQueue; + if (!m_HostQueue) { + cl_command_queue_properties CreationFlags = + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE; + + if (m_PropList.has_property()) { + CreationFlags |= CL_QUEUE_PROFILING_ENABLE; + } + + cl_int Error = CL_SUCCESS; +#ifdef CL_VERSION_2_0 + vector_class CreationFlagProperties = { + CL_QUEUE_PROPERTIES, CreationFlags, 0}; + m_CommandQueue = clCreateCommandQueueWithProperties( + m_Context.get(), m_Device.get(), CreationFlagProperties.data(), + &Error); +#else + m_CommandQueue = clCreateCommandQueue(m_Context.get(), m_Device.get(), + CreationFlags, &Error); +#endif + CHECK_OCL_CODE(Error); + // TODO catch an exception and put it to list of asynchronous exceptions + } + } + + queue_impl(cl_command_queue CLQueue, const context &SyclContext, + const async_handler &AsyncHandler) + : m_Context(SyclContext), m_AsyncHandler(AsyncHandler), + m_CommandQueue(CLQueue), 
m_OpenCLInterop(true), m_HostQueue(false) { + + cl_device_id CLDevice = nullptr; + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clGetCommandQueueInfo(m_CommandQueue, CL_QUEUE_DEVICE, + sizeof(CLDevice), &CLDevice, nullptr)); + m_Device = device(CLDevice); + // TODO catch an exception and put it to list of asynchronous exceptions + CHECK_OCL_CODE(clRetainCommandQueue(m_CommandQueue)); + } + + ~queue_impl() { + if (m_OpenCLInterop) { + CHECK_OCL_CODE_NO_EXC(clReleaseCommandQueue(m_CommandQueue)); + } + } + + cl_command_queue get() { + if (m_OpenCLInterop) { + CHECK_OCL_CODE(clRetainCommandQueue(m_CommandQueue)); + return m_CommandQueue; + } + throw invalid_object_error( + "This instance of queue doesn't support OpenCL interoperability"); + } + + context get_context() const { return m_Context; } + + device get_device() const { return m_Device; } + + bool is_host() const { return m_HostQueue; } + + template + typename info::param_traits::return_type get_info() const; + + template event submit(T cgf, std::shared_ptr self, + std::shared_ptr second_queue) { + event Event; + try { + Event = submit_impl(cgf, self); + } catch (...) { + m_Exceptions.push_back(std::current_exception()); + Event = second_queue->submit(cgf, second_queue); + } + return Event; + } + + template event submit(T cgf, std::shared_ptr self) { + event Event; + try { + Event = submit_impl(cgf, self); + } catch(...) { + m_Exceptions.push_back(std::current_exception()); + } + return Event; + } + + void wait() { + // TODO: Make thread safe. 
+ for (auto &evnt : m_Events) + evnt.wait(); + m_Events.clear(); + } + + exception_list getExceptionList() const { return m_Exceptions; } + + void wait_and_throw() { + wait(); + throw_asynchronous(); + } + + void throw_asynchronous() { + if (m_AsyncHandler && m_Exceptions.size()) { + m_AsyncHandler(m_Exceptions); + } + m_Exceptions.clear(); + } + + cl_command_queue &getHandleRef() { return m_CommandQueue; } + + template bool has_property() const { + return m_PropList.has_property(); + } + + template propertyT get_property() const { + return m_PropList.get_property(); + } + +private: + template + event submit_impl(T cgf, std::shared_ptr self) { + handler Handler(std::move(self), m_HostQueue); + cgf(Handler); + event Event = Handler.finalize(); + // TODO: Make thread safe. + m_Events.push_back(Event); + return Event; + } + + device m_Device; + context m_Context; + vector_class m_Events; + exception_list m_Exceptions; + async_handler m_AsyncHandler; + property_list m_PropList; + + cl_command_queue m_CommandQueue = nullptr; + bool m_OpenCLInterop = false; + bool m_HostQueue = false; +}; + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/scheduler/commands.cpp b/sycl/include/CL/sycl/detail/scheduler/commands.cpp new file mode 100644 index 000000000000..09f7f09342fc --- /dev/null +++ b/sycl/include/CL/sycl/detail/scheduler/commands.cpp @@ -0,0 +1,161 @@ +//==----------- commands.cpp -----------------------------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace csd = cl::sycl::detail; + +namespace cl { +namespace sycl { +namespace simple_scheduler { + +template +const Dst *getParamAddress(const Src *ptr, uint64_t Offset) { + return reinterpret_cast((const char *)ptr + Offset); +} + +template +void ExecuteKernelCommand< + KernelType, Dimensions, RangeType, KernelArgType, + SingleTask>::executeKernel(std::vector DepEvents, + EventImplPtr Event) { + if (m_Queue->is_host()) { + detail::waitEvents(DepEvents); + Event->setIsHostEvent(true); + return runOnHost(); + } + + if (!m_ClKernel) { + m_ClKernel = detail::ProgramManager::getInstance().getOrCreateKernel( + m_Queue->get_context(), m_KernelName.c_str()); + } + + if (m_KernelArgs != nullptr) { + for (unsigned I = 0; I < m_KernelArgsNum; ++I) { + switch (m_KernelArgs[I].kind) { + case csd::kernel_param_kind_t::kind_std_layout: { + const void *Ptr = + getParamAddress(&m_HostKernel, m_KernelArgs[I].offset); + CHECK_OCL_CODE( + clSetKernelArg(m_ClKernel, I, m_KernelArgs[I].info, Ptr)); + break; + } + case csd::kernel_param_kind_t::kind_accessor: { + switch (static_cast(m_KernelArgs[I].info)) { + case cl::sycl::access::target::global_buffer: + case cl::sycl::access::target::constant_buffer: { + auto *Ptr = + *(getParamAddress *>( + &m_HostKernel, m_KernelArgs[I].offset)); + cl_mem CLBuf = Ptr->getOpenCLMem(); + CHECK_OCL_CODE(clSetKernelArg(m_ClKernel, I, sizeof(cl_mem), &CLBuf)); + break; + } + case cl::sycl::access::target::local: { + auto *Ptr = + getParamAddress(&m_HostKernel, m_KernelArgs[I].offset); + CHECK_OCL_CODE(clSetKernelArg(m_ClKernel, I, *Ptr, nullptr)); + break; + } + // TODO handle these cases + case cl::sycl::access::target::image: + case cl::sycl::access::target::host_buffer: + case cl::sycl::access::target::host_image: + case cl::sycl::access::target::image_array: + assert(0); + } + 
break; + } + // TODO implement + case csd::kernel_param_kind_t::kind_sampler: + assert(0); + } + } + } + for (const auto &Arg : m_InteropArgs) { + if (Arg.m_Ptr.get() != nullptr) { + CHECK_OCL_CODE(clSetKernelArg(m_ClKernel, Arg.m_ArgIndex, Arg.m_Size, + Arg.m_Ptr.get())); + } else { + cl_mem CLBuf = Arg.m_BufReq->getCLMemObject(); + CHECK_OCL_CODE( + clSetKernelArg(m_ClKernel, Arg.m_ArgIndex, sizeof(cl_mem), &CLBuf)); + } + } + + std::vector CLEvents = + detail::getOrWaitEvents(std::move(DepEvents), m_Queue->get_context()); + cl_event &CLEvent = Event->getHandleRef(); + CLEvent = runEnqueueNDRangeKernel(m_Queue->getHandleRef(), m_ClKernel, + std::move(CLEvents)); + Event->setIsHostEvent(false); +} + +template +template +typename std::enable_if>::value, + cl_event>::type +ExecuteKernelCommand< + KernelType, Dimensions, RangeType, KernelArgType, + SingleTask>::runEnqueueNDRangeKernel(cl_command_queue &EnvQueue, + cl_kernel &Kernel, + std::vector CLEvents) { + size_t GlobalWorkSize[Dimensions]; + size_t GlobalWorkOffset[Dimensions]; + for (int I = 0; I < Dimensions; I++) { + GlobalWorkSize[I] = m_WorkItemsRange[I]; + GlobalWorkOffset[I] = m_WorkItemsOffset[I]; + } + cl_event CLEvent; + cl_int error = clEnqueueNDRangeKernel( + EnvQueue, Kernel, Dimensions, GlobalWorkOffset, GlobalWorkSize, nullptr, + CLEvents.size(), CLEvents.data(), &CLEvent); + CHECK_OCL_CODE(error); + return CLEvent; +} + +template +template +typename std::enable_if>::value, + cl_event>::type +ExecuteKernelCommand< + KernelType, Dimensions, RangeType, KernelArgType, + SingleTask>::runEnqueueNDRangeKernel(cl_command_queue &EnvQueue, + cl_kernel &Kernel, + std::vector CLEvents) { + size_t GlobalWorkSize[Dimensions]; + size_t LocalWorkSize[Dimensions]; + size_t GlobalWorkOffset[Dimensions]; + for (int I = 0; I < Dimensions; I++) { + GlobalWorkSize[I] = m_WorkItemsRange.get_global_range()[I]; + LocalWorkSize[I] = m_WorkItemsRange.get_local_range()[I]; + GlobalWorkOffset[I] = 
m_WorkItemsRange.get_offset()[I]; + } + cl_event CLEvent; + cl_int Err = clEnqueueNDRangeKernel( + EnvQueue, Kernel, Dimensions, GlobalWorkOffset, GlobalWorkSize, + LocalWorkSize, CLEvents.size(), CLEvents.data(), &CLEvent); + CHECK_OCL_CODE(Err); + return CLEvent; +} + +} // namespace simple_scheduler +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/scheduler/commands.h b/sycl/include/CL/sycl/detail/scheduler/commands.h new file mode 100644 index 000000000000..ab038e6e37dd --- /dev/null +++ b/sycl/include/CL/sycl/detail/scheduler/commands.h @@ -0,0 +1,400 @@ +//==----------- commands.h -------------------------------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace cl { +namespace sycl { +namespace detail { +class queue_impl; +} +namespace simple_scheduler { +using QueueImplPtr = std::shared_ptr; +using EventImplPtr = std::shared_ptr; +namespace csd = cl::sycl::detail; + +class Command { +public: + enum CommandType { RUN_KERNEL, MOVE_MEMORY, ALLOCA, COPY, FILL }; + + Command(CommandType Type, QueueImplPtr Queue); + + CommandType getType() const { return m_Type; } + + size_t getID() const { return m_ID; } + + void addDep(std::shared_ptr Dep, BufferReqPtr Buf) { + m_Deps.emplace_back(std::move(Dep), std::move(Buf)); + } + + void addInteropArg(InteropArg Arg) { m_InteropArgs.push_back(Arg); } + + cl::sycl::event enqueue(std::vector DepEvents) { + bool Expected = false; + if (m_Enqueued.compare_exchange_strong(Expected, true)) { + enqueueImp(std::move(DepEvents), detail::getSyclObjImpl(m_Event)); + } + return m_Event; + } + + bool isEnqueued() const { return m_Enqueued; 
} + + virtual void dump() const = 0; + + virtual void print(std::ostream &Stream) const = 0; + + virtual void printDot(std::ostream &Stream) const = 0; + + QueueImplPtr getQueue() const { return m_Queue; } + + cl::sycl::event getEvent() const { return m_Event; } + + std::shared_ptr getDepCommandForReqBuf(const BufferReqPtr &Buf) { + for (const auto &Dep : m_Deps) { + if (Dep.second->isSame(Buf)) { + return Dep.first; + } + } + return nullptr; + } + + cl::sycl::access::mode getAccessModeForReqBuf(const BufferReqPtr &Buf) const { + for (const auto &Dep : m_Deps) { + if (Dep.second->isSame(Buf)) { + return Dep.second->getAccessModeType(); + } + } + throw cl::sycl::runtime_error("Buffer not found."); + } + + void replaceDepCommandForReqBuf(const BufferReqPtr &Buf, + std::shared_ptr NewCommand) { + for (auto &Dep : m_Deps) { + if (Dep.second->isSame(Buf)) { + Dep.first = std::move(NewCommand); + return; + } + } + throw cl::sycl::runtime_error("Buffer not found."); + } + + std::vector, BufferReqPtr>> + getDependencies() { + return m_Deps; + } + + void removeAllDeps() { m_Deps.clear(); } + + virtual ~Command() = default; + +private: + virtual void enqueueImp(std::vector DepEvents, + EventImplPtr Event) = 0; + + CommandType m_Type; + size_t m_ID; + cl::sycl::event m_Event; + std::atomic m_Enqueued; + +protected: + QueueImplPtr m_Queue; + std::vector, BufferReqPtr>> m_Deps; + std::vector m_InteropArgs; +}; + +using CommandPtr = std::shared_ptr; + +class MemMoveCommand : public Command { +public: + MemMoveCommand(BufferReqPtr Buf, QueueImplPtr SrcQueue, QueueImplPtr DstQueue, + cl::sycl::access::mode mode) + : Command(Command::MOVE_MEMORY, std::move(DstQueue)), + m_Buf(std::move(Buf)), m_AccessMode(mode), + m_SrcQueue(std::move(SrcQueue)) {} + + access::mode getAccessModeType() const { return m_Buf->getAccessModeType(); } + void printDot(std::ostream &Stream) const override; + void print(std::ostream &Stream) const override; + void dump() const override { print(std::cout); } 
+ +private: + void enqueueImp(std::vector DepEvents, + EventImplPtr Event) override; + BufferReqPtr m_Buf = nullptr; + cl::sycl::access::mode m_AccessMode; + QueueImplPtr m_SrcQueue; +}; + +class AllocaCommand : public Command { +public: + AllocaCommand(BufferReqPtr Buf, QueueImplPtr Queue, + cl::sycl::access::mode mode) + : Command(Command::ALLOCA, std::move(Queue)), m_Buf(std::move(Buf)), + m_AccessMode(mode) {} + + access::mode getAccessModeType() const { return m_Buf->getAccessModeType(); } + void printDot(std::ostream &Stream) const override; + void print(std::ostream &Stream) const override; + void dump() const override { print(std::cout); } + +private: + void enqueueImp(std::vector DepEvents, + EventImplPtr Event) override; + BufferReqPtr m_Buf = nullptr; + cl::sycl::access::mode m_AccessMode; +}; + +template +class ExecuteKernelCommand : public Command { +public: + ExecuteKernelCommand(KernelType &HostKernel, const std::string KernelName, + const unsigned int KernelArgsNum, + const detail::kernel_param_desc_t *KernelArgs, + RangeType workItemsRange, QueueImplPtr Queue, + cl_kernel ClKernel, id workItemOffset = {}) + : Command(Command::RUN_KERNEL, std::move(Queue)), + m_KernelName(KernelName), m_KernelArgsNum(KernelArgsNum), + m_KernelArgs(KernelArgs), m_WorkItemsRange(workItemsRange), + m_WorkItemsOffset(workItemOffset), m_HostKernel(HostKernel), + m_ClKernel(ClKernel) {} + + void printDot(std::ostream &Stream) const override; + void print(std::ostream &Stream) const override; + void dump() const override { print(std::cout); } + +private: + cl_kernel createKernel(const std::string &KernelName, + cl_program Program) const; + + template ::type> + void runOnHost() { + m_HostKernel(); + } + + template + typename std::enable_if< + (STask == false) && (Dims > 0 && Dims < 4) && + std::is_same>::value && + std::is_same>::value, + void>::type + runOnHost() { + const size_t ZMax = (Dims > 2) ? m_WorkItemsRange[2] : 1; + const size_t YMax = (Dims > 1) ? 
m_WorkItemsRange[1] : 1; + size_t XYZ[3]; + for (XYZ[2] = 0; XYZ[2] < ZMax; ++XYZ[2]) { + for (XYZ[1] = 0; XYZ[1] < YMax; ++XYZ[1]) { + for (XYZ[0] = 0; XYZ[0] < m_WorkItemsRange[0]; ++XYZ[0]) { + id ID; + for (int I = 0; I < Dims; ++I) { + ID[I] = XYZ[I]; + } + m_HostKernel(ID); + } + } + } + } + + template + typename std::enable_if< + (STask == false) && (Dims > 0 && Dims < 4) && + std::is_same>::value && + (std::is_same>::value || + std::is_same>::value), + void>::type + runOnHost() { + const size_t ZMax = (Dims > 2) ? m_WorkItemsRange[2] : 1; + const size_t YMax = (Dims > 1) ? m_WorkItemsRange[1] : 1; + size_t XYZ[3]; + for (XYZ[2] = 0; XYZ[2] < ZMax; ++XYZ[2]) { + for (XYZ[1] = 0; XYZ[1] < YMax; ++XYZ[1]) { + for (XYZ[0] = 0; XYZ[0] < m_WorkItemsRange[0]; ++XYZ[0]) { + id ID; + range Range; + for (int I = 0; I < Dims; ++I) { + ID[I] = XYZ[I]; + Range[I] = m_WorkItemsRange[I]; + } + item Item = + detail::Builder::createItem(Range, ID); + m_HostKernel(Item); + } + } + } + } + + template + typename std::enable_if< + (STask == false) && (Dims > 0 && Dims < 4) && + std::is_same>::value, + void>::type + runOnHost() { + // TODO add offset logic + + const id<3> GlobalSize{ + m_WorkItemsRange.get_global_range()[0], + ((Dims > 1) ? m_WorkItemsRange.get_global_range()[1] : 1), + ((Dims > 2) ? m_WorkItemsRange.get_global_range()[2] : 1)}; + const id<3> LocalSize{ + m_WorkItemsRange.get_local_range()[0], + ((Dims > 1) ? m_WorkItemsRange.get_local_range()[1] : 1), + ((Dims > 2) ? 
m_WorkItemsRange.get_local_range()[2] : 1)}; + id<3> GroupSize; + for (int I = 0; I < 3; ++I) { + GroupSize[I] = GlobalSize[I] / LocalSize[I]; + } + + size_t GlobalXYZ[3]; + for (GlobalXYZ[2] = 0; GlobalXYZ[2] < GroupSize[2]; ++GlobalXYZ[2]) { + for (GlobalXYZ[1] = 0; GlobalXYZ[1] < GroupSize[1]; ++GlobalXYZ[1]) { + for (GlobalXYZ[0] = 0; GlobalXYZ[0] < GroupSize[0]; ++GlobalXYZ[0]) { + id ID; + for (int I = 0; I < Dims; ++I) { + ID[I] = GlobalXYZ[I]; + } + group Group = detail::Builder::createGroup( + m_WorkItemsRange.get_global_range(), + m_WorkItemsRange.get_local_range(), ID); + size_t LocalXYZ[3]; + for (LocalXYZ[2] = 0; LocalXYZ[2] < LocalSize[2]; ++LocalXYZ[2]) { + for (LocalXYZ[1] = 0; LocalXYZ[1] < LocalSize[1]; ++LocalXYZ[1]) { + for (LocalXYZ[0] = 0; LocalXYZ[0] < LocalSize[0]; ++LocalXYZ[0]) { + id GlobalID; + id LocalID; + for (int I = 0; I < Dims; ++I) { + GlobalID[I] = GlobalXYZ[I] * LocalSize[I] + LocalXYZ[I]; + LocalID[I] = LocalXYZ[I]; + } + const item GlobalItem = + detail::Builder::createItem( + m_WorkItemsRange.get_global_range(), GlobalID, + m_WorkItemsRange.get_offset()); + const item LocalItem = + detail::Builder::createItem( + m_WorkItemsRange.get_local_range(), LocalID); + nd_item NDItem = detail::Builder::createNDItem( + GlobalItem, LocalItem, Group); + m_HostKernel(NDItem); + } + } + } + } + } + } + } + + void executeKernel(std::vector DepEvents, + EventImplPtr Event); + + void enqueueImp(std::vector DepEvents, + EventImplPtr Event) override { + executeKernel(std::move(DepEvents), std::move(Event)); + } + + template + typename std::enable_if>::value, + cl_event>::type + runEnqueueNDRangeKernel(cl_command_queue &EnvQueue, cl_kernel &Kernel, + std::vector CLEvents); + + template + typename std::enable_if>::value, + cl_event>::type + runEnqueueNDRangeKernel(cl_command_queue &EnvQueue, cl_kernel &Kernel, + std::vector CLEvents); + + std::string m_KernelName; + const unsigned int m_KernelArgsNum; + const detail::kernel_param_desc_t 
*m_KernelArgs; + RangeType m_WorkItemsRange; + id m_WorkItemsOffset; + KernelType m_HostKernel; + cl_kernel m_ClKernel; +}; + +template class FillCommand : public Command { +public: + FillCommand(BufferReqPtr Buf, T Pattern, QueueImplPtr Queue, range Range, + id Offset) + : Command(Command::FILL, std::move(Queue)), m_Buf(std::move(Buf)), + m_Pattern(std::move(Pattern)), m_Offset(std::move(Offset)), + m_Range(std::move(Range)) {} + + access::mode getAccessModeType() const { return m_Buf->getAccessModeType(); } + void printDot(std::ostream &Stream) const override; + void print(std::ostream &Stream) const override; + void dump() const override { print(std::cout); } + +private: + void enqueueImp(std::vector DepEvents, EventImplPtr Event) { + assert(nullptr != m_Buf && "Buf is nullptr"); + m_Buf->fill(m_Queue, std::move(DepEvents), std::move(Event), &m_Pattern, + sizeof(T), Dim, &m_Offset[0], &m_Range[0]); + } + BufferReqPtr m_Buf = nullptr; + T m_Pattern; + id m_Offset; + range m_Range; +}; + +template class CopyCommand : public Command { +public: + CopyCommand(BufferReqPtr BufSrc, BufferReqPtr BufDest, QueueImplPtr Queue, + range SrcRange, id SrcOffset, + id DestOffset, size_t SizeTySrc, size_t SizeSrc, + range BuffSrcRange) + : Command(Command::COPY, std::move(Queue)), m_BufSrc(std::move(BufSrc)), + m_BufDest(std::move(BufDest)), m_SrcRange(std::move(SrcRange)), + m_SrcOffset(std::move(SrcOffset)), m_DestOffset(std::move(DestOffset)), + m_SizeTySrc(SizeTySrc), m_SizeSrc(SizeSrc), + m_BuffSrcRange(BuffSrcRange) {} + + access::mode getAccessModeType() const { + return m_BufDest->getAccessModeType(); + } + void printDot(std::ostream &Stream) const override; + void print(std::ostream &Stream) const override; + void dump() const override { print(std::cout); } + +private: + void enqueueImp(std::vector DepEvents, EventImplPtr Event) { + assert(nullptr != m_BufSrc && "m_BufSrc is nullptr"); + assert(nullptr != m_BufDest && "m_BufDest is nullptr"); + m_BufDest->copy(m_Queue, 
std::move(DepEvents), std::move(Event), m_BufSrc, + DimSrc, &m_SrcRange[0], &m_SrcOffset[0], &m_DestOffset[0], + m_SizeTySrc, m_SizeSrc, &m_BuffSrcRange[0]); + } + BufferReqPtr m_BufSrc = nullptr; + BufferReqPtr m_BufDest = nullptr; + range m_SrcRange; + id m_SrcOffset; + id m_DestOffset; + size_t m_SizeTySrc; + size_t m_SizeSrc; + range m_BuffSrcRange; +}; + +} // namespace simple_scheduler +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/scheduler/printers.cpp b/sycl/include/CL/sycl/detail/scheduler/printers.cpp new file mode 100644 index 000000000000..660f470a8365 --- /dev/null +++ b/sycl/include/CL/sycl/detail/scheduler/printers.cpp @@ -0,0 +1,202 @@ +//==----------- printers.cpp -----------------------------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include + +namespace cl { +namespace sycl { +namespace simple_scheduler { + +static std::string accessMode2String(cl::sycl::access::mode Type) { + switch (Type) { + case access::mode::write: + return "write"; + case access::mode::read: + return "read"; + case access::mode::read_write: + return "read_write"; + default: + return "unhandled"; + } +} + +static std::string +getDeviceTypeString(const cl::sycl::device &Device, + access::target Target = access::target::global_buffer) { + if (access::target::host_buffer == Target) { + return "User host."; + } + if (Device.is_cpu()) { + return "CPU"; + } + if (Device.is_gpu()) { + return "GPU"; + } + if (Device.is_accelerator()) { + return "ACC"; + } + if (Device.is_host()) { + return "HOST"; + } + return ""; +} + +static std::string +getColor(const cl::sycl::device &Device, + access::target Target = access::target::global_buffer) { + if 
(access::target::host_buffer == Target) { + return "#FFDEAD"; // navajowhite1 + } + if (Device.is_cpu()) { + return "#00BFFF"; // deepskyblue1 + } + if (Device.is_gpu()) { + return "#00FF7F"; // green + } + if (Device.is_accelerator()) { + return "#FF0000"; // red + } + if (Device.is_host()) { + return "#FFBBFF"; // plum1 + } + return ""; +} + +template +void ExecuteKernelCommand::printDot(std::ostream &Stream) const { + const std::string CommandColor = getColor(m_Queue->get_device()); + + Stream << "\"" << this << "\" [style=filled, label=\""; + + Stream << "ID = " << getID() << " ; "; + Stream << "RUN_KERNEL " + << "\\n" + << m_KernelName << " ON "; + Stream << getDeviceTypeString(m_Queue->get_device()) << "\\n"; + + Stream << "\", fillcolor=\"" << CommandColor << "\"];" << std::endl; + + for (const auto &Dep : m_Deps) { + const auto &Buf = Dep.second; + Stream << " \"" << this << "\" -> \"" << Dep.first << "\" [ label=\""; + Stream << accessMode2String(Buf->getAccessModeType()) << "\" ];"; + Stream << std::endl; + } +} + +template +void ExecuteKernelCommand::print(std::ostream &Stream) const { + Stream << "ID = " << getID() << " ; "; + Stream << "RUN_KERNEL " << m_KernelName << " ON "; + Stream << getDeviceTypeString(m_Queue->get_device()) << std::endl; + Stream << " Dependency:" << std::endl; + + for (const auto &Dep : m_Deps) { + const auto &Command = Dep.first; + const auto &Buf = Dep.second; + Stream << " Dep on buf " << Buf->getUniqID() << " "; + Stream << accessMode2String(Buf->getAccessModeType()); + Stream << " from Command ID = " << Command->getID() << std::endl; + } +} + +template +void FillCommand::printDot(std::ostream &Stream) const { + const std::string CommandColor = getColor(m_Queue->get_device()); + + Stream << "\"" << this << "\" [style=filled, label=\""; + + Stream << "ID = " << getID() << " ; "; + Stream << "Fill " + << "\\n" + << " Buf : " << m_Buf->getUniqID() << " ON "; + Stream << getDeviceTypeString(m_Queue->get_device()) << "\\n"; + + 
Stream << "\", fillcolor=\"" << CommandColor << "\"];" << std::endl; + + for (const auto &Dep : m_Deps) { + const auto &Buf = Dep.second; + Stream << " \"" << this << "\" -> \"" << Dep.first << "\" [ label=\""; + Stream << accessMode2String(Buf->getAccessModeType()) << "\" ];"; + Stream << std::endl; + } +} + +template +void FillCommand::print(std::ostream &Stream) const { + Stream << "ID = " << getID() << " ; "; + Stream << "Fill " + << " Buf : " << m_Buf->getUniqID() << " ON "; + Stream << getDeviceTypeString(m_Queue->get_device()) << std::endl; + Stream << " Dependency:" << std::endl; + + for (const auto &Dep : m_Deps) { + const auto &Command = Dep.first; + const auto &Buf = Dep.second; + Stream << " Dep on buf " << Buf->getUniqID() << " "; + Stream << accessMode2String(Buf->getAccessModeType()); + Stream << " from Command ID = " << Command->getID() << std::endl; + } +} + +template +void CopyCommand::printDot(std::ostream &Stream) const { + const std::string CommandColor = getColor(m_Queue->get_device()); + + Stream << "\"" << this << "\" [style=filled, label=\""; + + Stream << "ID = " << getID() << " ; "; + Stream << "Copy " + << "\\n" + << " Buf : " << m_BufSrc->getUniqID() << " ON "; + Stream << getDeviceTypeString(m_Queue->get_device()) << "\\n"; + Stream << " To Buf : " << m_BufDest->getUniqID(); + + Stream << "\", fillcolor=\"" << CommandColor << "\"];" << std::endl; + + for (const auto &Dep : m_Deps) { + const auto &Buf = Dep.second; + Stream << " \"" << this << "\" -> \"" << Dep.first << "\" [ label=\""; + Stream << accessMode2String(Buf->getAccessModeType()) << "\" ];"; + Stream << std::endl; + } +} + +template +void CopyCommand::print(std::ostream &Stream) const { + Stream << "ID = " << getID() << " ; "; + Stream << "Copy " + << " Buf : " << m_BufSrc->getUniqID() << " ON "; + Stream << getDeviceTypeString(m_Queue->get_device()) << std::endl; + Stream << " Buf : " << m_BufDest->getUniqID(); + Stream << " Dependency:" << std::endl; + + for (const auto 
&Dep : m_Deps) { + const auto &Command = Dep.first; + const auto &Buf = Dep.second; + Stream << " Dep on buf " << Buf->getUniqID() << " "; + Stream << accessMode2String(Buf->getAccessModeType()); + Stream << " from Command ID = " << Command->getID() << std::endl; + } +} + +} // namespace simple_scheduler +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/scheduler/requirements.h b/sycl/include/CL/sycl/detail/scheduler/requirements.h new file mode 100644 index 000000000000..7e36271b8220 --- /dev/null +++ b/sycl/include/CL/sycl/detail/scheduler/requirements.h @@ -0,0 +1,169 @@ +//==----------- requirements.h ---------------------------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +#include +#include +#include + +namespace cl { +namespace sycl { +namespace detail { +template class buffer_impl; +} // namespace detail + +namespace detail { +class queue_impl; +class event_impl; +} // namespace detail +namespace simple_scheduler { + +using QueueImplPtr = std::shared_ptr; +using EventImplPtr = std::shared_ptr; + +class BufferRequirement; +using BufferReqPtr = std::shared_ptr; + +class BufferRequirement { +public: + BufferRequirement(void *UniqID, access::mode AccessMode, + access::target TargetType) + : m_UniqID(UniqID), m_AccessMode(AccessMode), m_TargetType(TargetType) {} + + virtual ~BufferRequirement() = default; + + bool isBigger(const std::shared_ptr &RHS) const { + return m_UniqID > RHS->m_UniqID; + } + + bool isSame(const std::shared_ptr &RHS) const { + return m_UniqID == RHS->m_UniqID; + } + + void *getUniqID() const { return m_UniqID; } + + access::mode getAccessModeType() const { return m_AccessMode; } + + virtual cl_mem getCLMemObject() = 0; + + virtual void 
allocate(QueueImplPtr Queue, + std::vector DepEvents, + EventImplPtr Event) = 0; + + virtual void moveMemoryTo(QueueImplPtr Queue, + std::vector DepEvents, + EventImplPtr Event) = 0; + + virtual void fill(QueueImplPtr Queue, std::vector DepEvents, + EventImplPtr Event, void *Pattern, size_t PatternSize, + int Dim, size_t *Offset, size_t *Range) = 0; + + virtual void copy(QueueImplPtr Queue, std::vector DepEvents, + EventImplPtr Event, BufferReqPtr SrcReq, const int DimSrc, + const size_t *const SrcRange, const size_t *const SrcOffset, + const size_t *const DestOffset, const size_t SizeTySrc, + const size_t SizeSrc, const size_t *const BuffSrcRange) = 0; + + access::target getTargetType() const { return m_TargetType; } + + void addAccessMode(const access::mode AccessMode) { + if (access::mode::read == m_AccessMode && + access::mode::read != AccessMode) { + m_AccessMode = access::mode::read_write; + } else if (access::mode::write == m_AccessMode && + (AccessMode != access::mode::write && + AccessMode != access::mode::discard_write)) { + m_AccessMode = access::mode::read_write; + } + } + +protected: + void *m_UniqID; + access::mode m_AccessMode; + access::target m_TargetType; +}; + +template +class BufferStorage : public BufferRequirement { +public: + BufferStorage( + typename cl::sycl::detail::buffer_impl &Buffer) + : BufferRequirement(&Buffer, Mode, Target), m_Buffer(&Buffer) {} + + void allocate(QueueImplPtr Queue, std::vector DepEvents, + EventImplPtr Event) override { + assert(m_Buffer != nullptr && "BufferStorage::m_Buffer is nullptr"); + m_Buffer->allocate(std::move(Queue), std::move(DepEvents), std::move(Event), + Mode); + } + + void moveMemoryTo(QueueImplPtr Queue, std::vector DepEvents, + EventImplPtr Event) override { + assert(m_Buffer != nullptr && "BufferStorage::m_Buffer is nullptr"); + m_Buffer->moveMemoryTo(std::move(Queue), std::move(DepEvents), + std::move(Event)); + } + + void fill(QueueImplPtr Queue, std::vector DepEvents, + EventImplPtr Event, 
void *Pattern, size_t PatternSize, int Dim, + size_t *Offset, size_t *Range) override { + assert(m_Buffer != nullptr && "BufferStorage::m_Buffer is nullptr"); + m_Buffer->fill(std::move(Queue), std::move(DepEvents), std::move(Event), + std::move(Pattern), PatternSize, Dim, Offset, Range); + } + + void copy(QueueImplPtr Queue, std::vector DepEvents, + EventImplPtr Event, BufferReqPtr SrcReq, const int DimSrc, + const size_t *const SrcRange, const size_t *const SrcOffset, + const size_t *const DestOffset, const size_t SizeTySrc, + const size_t SizeSrc, const size_t *const BuffSrcRange) override { + assert(m_Buffer != nullptr && "BufferStorage::m_Buffer is nullptr"); + assert(SrcReq != nullptr && "BufferStorage::SrcReq is nullptr"); + + m_Buffer->copy(std::move(Queue), std::move(DepEvents), std::move(Event), + std::move(SrcReq), DimSrc, SrcRange, SrcOffset, DestOffset, + SizeTySrc, SizeSrc, BuffSrcRange); + } + + cl_mem getCLMemObject() override { + assert(m_Buffer != nullptr && "BufferStorage::m_Buffer is nullptr"); + return m_Buffer->getOpenCLMem(); + } + +private: + cl::sycl::detail::buffer_impl *m_Buffer = nullptr; +}; + +struct classcomp { + bool operator()(const BufferReqPtr &LHS, const BufferReqPtr &RHS) const { + return LHS->isBigger(RHS); + } +}; + +// Represents a call of set_arg made in the SYCL application +struct InteropArg { + shared_ptr_class m_Ptr; + size_t m_Size; + int m_ArgIndex; + BufferReqPtr m_BufReq; + + InteropArg(shared_ptr_class Ptr, size_t Size, int ArgIndex, + BufferReqPtr BufReq) + : m_Ptr(Ptr), m_Size(Size), m_ArgIndex(ArgIndex), m_BufReq(BufReq) {} +}; + +} // namespace simple_scheduler +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/scheduler/scheduler.cpp b/sycl/include/CL/sycl/detail/scheduler/scheduler.cpp new file mode 100644 index 000000000000..dd12594823d1 --- /dev/null +++ b/sycl/include/CL/sycl/detail/scheduler/scheduler.cpp @@ -0,0 +1,303 @@ +//==----------- scheduler.cpp 
----------------------------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace cl { +namespace sycl { +namespace simple_scheduler { + +template +static BufferReqPtr +getReqForBuffer(const std::set &BufReqs, + const detail::buffer_impl &Buf) { + for (const auto &Req : BufReqs) { + if (Req->getUniqID() == &Buf) { + return Req; + } + } + return nullptr; +} + +// Adds a buffer requirement for this node. +template +void Node::addBufRequirement( + detail::buffer_impl &Buf) { + BufferReqPtr Req = getReqForBuffer(m_Bufs, Buf); + + // Check if there is requirement for the same buffer already. + if (nullptr != Req) { + Req->addAccessMode(Mode); + } else { + BufferReqPtr BufStor = std::make_shared< + BufferStorage>(Buf); + m_Bufs.insert(BufStor); + } +} + +// Adds an accessor requirement for this node. +template +void Node::addAccRequirement( + accessor &&Acc, + int argIndex) { + detail::buffer_impl *buf = + Acc.template accessor_base::__impl() + ->m_Buf; + addBufRequirement(*buf); + addInteropArg(nullptr, buf->get_size(), argIndex, + getReqForBuffer(m_Bufs, *buf)); +} + +// Adds a kernel to this node, maps to single task. +template +void Node::addKernel(const std::string &KernelName, const int KernelArgsNum, + const detail::kernel_param_desc_t *KernelArgs, + KernelType KernelFunc, cl_kernel ClKernel) { + assert(!m_Kernel && "This node already contains an execution command"); + m_Kernel = + std::make_shared, id<1>, + /*SingleTask=*/true>>( + KernelFunc, KernelName, KernelArgsNum, KernelArgs, range<1>(1), + m_Queue, ClKernel); +} + +// Adds kernel to this node, maps on range parallel for. 
+template +void Node::addKernel(const std::string &KernelName, const int KernelArgsNum, + const detail::kernel_param_desc_t *KernelArgs, + KernelType KernelFunc, range NumWorkItems, + cl_kernel ClKernel) { + assert(!m_Kernel && "This node already contains an execution command"); + m_Kernel = + std::make_shared, KernelArgType>>( + KernelFunc, KernelName, KernelArgsNum, KernelArgs, NumWorkItems, + m_Queue, ClKernel); +} + +// Adds kernel to this node, maps to range parallel for with offset. +template +void Node::addKernel(const std::string &KernelName, const int KernelArgsNum, + const detail::kernel_param_desc_t *KernelArgs, + KernelType KernelFunc, range NumWorkItems, + id WorkItemOffset, cl_kernel ClKernel) { + assert(!m_Kernel && "This node already contains an execution command"); + m_Kernel = + std::make_shared, KernelArgType>>( + KernelFunc, KernelName, KernelArgsNum, KernelArgs, NumWorkItems, + m_Queue, ClKernel, WorkItemOffset); +} +// Adds kernel to this node, maps on nd_range parallel for. 
+template +void Node::addKernel(const std::string &KernelName, const int KernelArgsNum, + const detail::kernel_param_desc_t *KernelArgs, + KernelType KernelFunc, nd_range ExecutionRange, + cl_kernel ClKernel) { + assert(!m_Kernel && "This node already contains an execution command"); + m_Kernel = std::make_shared, nd_item>>( + KernelFunc, KernelName, KernelArgsNum, KernelArgs, ExecutionRange, + m_Queue, ClKernel); +} + +// Adds explicit memory operation to this node, maps on handler fill method +template +void Node::addExplicitMemOp( + accessor &Dest, T Src) { + auto *DestBase = Dest.template accessor_base::__impl(); + assert(DestBase != nullptr && + "Accessor should have an initialized accessor_base"); + detail::buffer_impl *Buf = DestBase->m_Buf; + + range Range = DestBase->Range; + id Offset = DestBase->Offset; + + BufferReqPtr Req = getReqForBuffer(m_Bufs, *Buf); + assert(Buf != nullptr && "Accessor should have an initialized buffer_impl"); + assert(!m_Kernel && "This node already contains an execution command"); + m_Kernel = std::make_shared>(Req, Src, m_Queue, + Range, Offset); +} + +// Adds explicit memory operation to this node, maps on handler copy method +template +void Node::addExplicitMemOp( + accessor Src, + accessor Dest) { + auto *SrcBase = Src.template accessor_base::__impl(); + assert(SrcBase != nullptr && + "Accessor should have an initialized accessor_base"); + auto *DestBase = + Dest.template accessor_base::__impl(); + assert(DestBase != nullptr && + "Accessor should have an initialized accessor_base"); + + detail::buffer_impl *SrcBuf = SrcBase->m_Buf; + assert(SrcBuf != nullptr && + "Accessor should have an initialized buffer_impl"); + detail::buffer_impl *DestBuf = DestBase->m_Buf; + assert(DestBuf != nullptr && + "Accessor should have an initialized buffer_impl"); + + range SrcRange = SrcBase->Range; + id SrcOffset = SrcBase->Offset; + id DestOffset = DestBase->Offset; + + range BuffSrcRange = SrcBase->m_Buf->get_range(); + + BufferReqPtr 
SrcReq = getReqForBuffer(m_Bufs, *SrcBuf); + BufferReqPtr DestReq = getReqForBuffer(m_Bufs, *DestBuf); + + assert(!m_Kernel && "This node already contains an execution command"); + m_Kernel = std::make_shared>( + SrcReq, DestReq, m_Queue, SrcRange, SrcOffset, DestOffset, sizeof(T_src), + SrcBase->get_count(), BuffSrcRange); +} + +// Updates host data of the specified accessor +template +void Scheduler::updateHost( + accessor &Acc, + cl::sycl::event &Event) { + auto *AccBase = Acc.template accessor_base::__impl(); + assert(AccBase != nullptr && + "Accessor should have an initialized accessor_base"); + detail::buffer_impl *Buf = AccBase->m_Buf; + + updateHost(*Buf, Event); +} + +template +void Scheduler::copyBack(detail::buffer_impl &Buf) { + cl::sycl::event Event; + updateHost(Buf, Event); + detail::getSyclObjImpl(Event)->waitInternal(); +} + +// Updates host data of the specified buffer_impl +template +void Scheduler::updateHost(detail::buffer_impl &Buf, + cl::sycl::event &Event) { + CommandPtr UpdateHostCmd; + BufferReqPtr BufStor = + std::make_shared>( + Buf); + + if (0 == m_BuffersEvolution.count(BufStor)) { + return; + } + + // TODO: Find a better way to say that we need copy to HOST, just nullptr? + cl::sycl::device HostDevice; + UpdateHostCmd = std::make_shared( + BufStor, m_BuffersEvolution[BufStor].back()->getQueue(), + detail::getSyclObjImpl(cl::sycl::queue(HostDevice)), + cl::sycl::access::mode::read_write); + + // Add dependency if there was operations with the buffer already. 
+ UpdateHostCmd->addDep(m_BuffersEvolution[BufStor].back(), BufStor); + + m_BuffersEvolution[BufStor].push_back(UpdateHostCmd); + Event = EnqueueCommand(std::move(UpdateHostCmd)); +} + +template +void Scheduler::removeBuffer( + detail::buffer_impl &Buf) { + BufferReqPtr BufStor = std::make_shared< + BufferStorage>(Buf); + + if (0 == m_BuffersEvolution.count(BufStor)) { + return; + } + + for (auto Cmd : m_BuffersEvolution[BufStor]) { + Cmd->removeAllDeps(); + } + + m_BuffersEvolution.erase(BufStor); +} + +static bool cmdsHaveEqualCxtAndDev(const CommandPtr &LHS, + const CommandPtr &RHS) { + return LHS->getQueue()->get_device() == RHS->getQueue()->get_device() && + LHS->getQueue()->get_context() == LHS->getQueue()->get_context(); +} + +// Adds new node to graph, creating an Alloca and MemMove commands if +// needed. +inline cl::sycl::event Scheduler::addNode(Node NewNode) { + // Process global buffers. + CommandPtr Cmd = NewNode.getKernel(); + for (auto Buf : NewNode.getRequirements()) { + // If it's the first command for buffer - insert alloca command. + if (m_BuffersEvolution[Buf].empty()) { + CommandPtr AllocaCmd = + std::make_shared(Buf, std::move(NewNode.getQueue()), + cl::sycl::access::mode::read_write); + m_BuffersEvolution[Buf].push_back(AllocaCmd); + } + // If targets of previous and new command differ - insert memmove command. + if (!cmdsHaveEqualCxtAndDev(m_BuffersEvolution[Buf].back(), Cmd)) { + CommandPtr MemMoveCmd = std::make_shared( + Buf, std::move(m_BuffersEvolution[Buf].back()->getQueue()), + std::move(NewNode.getQueue()), cl::sycl::access::mode::read_write); + MemMoveCmd->addDep(m_BuffersEvolution[Buf].back(), Buf); + m_BuffersEvolution[Buf].push_back(MemMoveCmd); + } + // Finally insert command to the buffer evolution vector. 
+ Cmd->addDep(m_BuffersEvolution[Buf].back(), Buf); + m_BuffersEvolution[Buf].push_back(Cmd); + } + // Process arguments set via interoperability interface + for (auto Arg : NewNode.getInteropArgs()) { + Cmd->addInteropArg(Arg); + } + // If the kernel has no requirements, store the event + if (NewNode.getRequirements().empty()) { + m_EventsWithoutRequirements.push_back( + detail::getSyclObjImpl(Cmd->getEvent())); + } + return EnqueueCommand(Cmd); +} +//} +} // namespace simple_scheduler +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/scheduler/scheduler.h b/sycl/include/CL/sycl/detail/scheduler/scheduler.h new file mode 100644 index 000000000000..66b61c3c868c --- /dev/null +++ b/sycl/include/CL/sycl/detail/scheduler/scheduler.h @@ -0,0 +1,233 @@ +//==----------- scheduler.h ------------------------------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace cl { +namespace sycl { +// Forward declaration +template +class accessor; + +namespace detail { +class queue_impl; +} +using QueueImplPtr = std::shared_ptr; + +namespace simple_scheduler { + +class Node { +public: + Node(QueueImplPtr Queue) : m_Queue(std::move(Queue)) {} + + Node(Node &&RHS) + : m_Bufs(std::move(RHS.m_Bufs)), + m_InteropArgs(std::move(RHS.m_InteropArgs)), + m_Kernel(std::move(RHS.m_Kernel)), m_Queue(std::move(RHS.m_Queue)), + m_NextOCLIndex(RHS.m_NextOCLIndex) {} + + // Adds a buffer requirement for this node. + template + void addBufRequirement(detail::buffer_impl &Buf); + + // Adds an accessor requirement for this node. 
+ template + void addAccRequirement(accessor &&Acc, + int argIndex); + + // Adds a kernel to this node, maps to single task. + template + void addKernel(const std::string &KernelName, const int KernelArgsNum, + const detail::kernel_param_desc_t *KernelArgs, + KernelType KernelFunc, cl_kernel ClKernel = nullptr); + + // Adds kernel to this node, maps on range parallel for. + template + void addKernel(const std::string &KernelName, const int KernelArgsNum, + const detail::kernel_param_desc_t *KernelArgs, + KernelType KernelFunc, range NumWorkItems, + cl_kernel ClKernel = nullptr); + + // Adds kernel to this node, maps on range parallel for with offset. + template + void addKernel(const std::string &KernelName, const int KernelArgsNum, + const detail::kernel_param_desc_t *KernelArgs, + KernelType KernelFunc, range NumWorkItems, + id WorkItemOffset, cl_kernel ClKernel = nullptr); + + // Adds kernel to this node, maps on nd_range parallel for. + template + void addKernel(const std::string &KernelName, const int KernelArgsNum, + const detail::kernel_param_desc_t *KernelArgs, + KernelType KernelFunc, nd_range ExecutionRange, + cl_kernel ClKernel = nullptr); + + // Adds explicit memory operation to this node, maps on handler fill method + template + void addExplicitMemOp(accessor &Dest, + T Src); + + // Adds explicit memory operation to this node, maps on handler copy method + template < + typename T_src, int dim_src, access::mode mode_src, + access::target tgt_src, typename T_dest, int dim_dest, + access::mode mode_dest, access::target tgt_dest, + access::placeholder isPlaceholder_src = access::placeholder::false_t, + access::placeholder isPlaceholder_dest = access::placeholder::false_t> + void addExplicitMemOp( + accessor Src, + accessor Dest); + + std::set &getRequirements() { return m_Bufs; } + + void addInteropArg(shared_ptr_class Ptr, size_t Size, int ArgIndex, + BufferReqPtr BufReq = nullptr); + + std::vector &getInteropArgs() { return m_InteropArgs; } + + 
CommandPtr getKernel() { return m_Kernel; } + + QueueImplPtr getQueue() { return m_Queue; } + +private: + // Contains buffer requirements for this node. + std::set m_Bufs; + // Contains arguments set via interoperability methods + std::vector m_InteropArgs; + // Represent execute kernel command. + CommandPtr m_Kernel; + + // SYCL queue for current command group. + QueueImplPtr m_Queue; + + // WORKAROUND. Id for mapping OpenCL buffer to OpenCL kernel argument. + size_t m_NextOCLIndex = 0; +}; + +class Scheduler { +public: + // Adds copying of the specified buffer_impl and waits for completion. + template + void copyBack(detail::buffer_impl &Buf); + + // Updates host data of the specified buffer_impl + template + void updateHost(detail::buffer_impl &Buf, + cl::sycl::event &Event); + + // Updates host data of the specified accessor + template + void updateHost(accessor &Acc, + cl::sycl::event &Event); + + // Frees the specified buffer_impl. + template + void removeBuffer(detail::buffer_impl &Buf); + + // Waits for the event passed. + void waitForEvent(EventImplPtr Event); + + // Adds new node to graph, creating an Alloca and MemMove commands if + // needed. 
+ cl::sycl::event addNode(Node NewNode); + + void print(std::ostream &Stream) const; + void printDot(std::ostream &Stream) const; + void dump() const { print(std::cout); } + + void dumpGraph() const { + std::fstream GraphDot("graph.dot", std::ios::out); + printDot(GraphDot); + } + + void dumpGraphForCommand(CommandPtr Cmd) const; + + void optimize() { parallelReadOpt(); } + + // Converts the following: + // + // ========= ========= ========= + // | kernel1 |<-| kernel2 |<--| kernel3 | + // | write A | | read A | | read A | + // ========= ========= ========= + // + // to: --------------------------- + // \/ | + // ========= ========= ========= + // | kernel1 |<-| kernel2 | | kernel3 | + // | write A | | read A | | read A | + // ========= ========= ========= + // + void parallelReadOpt(); + + static Scheduler &getInstance() { + static Scheduler instance; + return instance; + } + + enum DumpOptions { Text = 0, WholeGraph = 1, RunGraph = 2 }; + bool getDumpFlagValue(DumpOptions DumpOption); + +protected: + // TODO: Add releasing of OpenCL buffers. + + void enqueueAndWaitForCommand(CommandPtr Cmd); + + // Enqueues Cmd command and all its dependencies. + cl::sycl::event EnqueueCommand(CommandPtr Cmd); + + cl::sycl::event dispatch(CommandPtr Cmd); + + // Recursively generates dot records for the command passed and all that the + // command depends on. + void printGraphForCommand(CommandPtr Cmd, std::ostream &Stream) const; + +private: + Scheduler(); + ~Scheduler(); + std::array m_DumpOptions; + // Buffer that represents evolution of buffers - actions that is added + // for each buffer. + std::map, classcomp> m_BuffersEvolution; + // Events for tracking execution of kernels without requirements + std::vector m_EventsWithoutRequirements; + // TODO: At some point of time we should remove already processed commands. + // But we have to be sure that nobody will references them(thru events). 
+ + Scheduler(Scheduler const &) = delete; + Scheduler &operator=(Scheduler const &) = delete; +}; + +} // namespace simple_scheduler +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/device.hpp b/sycl/include/CL/sycl/device.hpp new file mode 100644 index 000000000000..9b660d357f93 --- /dev/null +++ b/sycl/include/CL/sycl/device.hpp @@ -0,0 +1,116 @@ +//==------------------- device.hpp - SYCL device ---------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace cl { +namespace sycl { +// Forward declarations +class device_selector; + +// TODO: 4.6.4 Partitioning into multiple SYCL devices +// TODO: 4.6.4.2 Device information descriptors +// TODO: Make code thread-safe +class device { +public: + device(); + + explicit device(cl_device_id deviceId); + + explicit device(const device_selector &deviceSelector); + + bool operator==(const device &rhs) const { return impl == rhs.impl; } + + bool operator!=(const device &rhs) const { return !(*this == rhs); } + + device(const device &rhs) = default; + + device(device &&rhs) = default; + + device &operator=(const device &rhs) = default; + + device &operator=(device &&rhs) = default; + + cl_device_id get() const { return impl->get(); } + + bool is_host() const { return impl->is_host(); } + + bool is_cpu() const { return impl->is_cpu(); } + + bool is_gpu() const { return impl->is_gpu(); } + + bool is_accelerator() const { return impl->is_accelerator(); } + + platform get_platform() const { return impl->get_platform(); } + + // Available only when prop == info::partition_property::partition_equally + template + typename std::enable_if<(prop == info::partition_property::partition_equally), + 
vector_class>::type + create_sub_devices(size_t ComputeUnits) const { + return impl->create_sub_devices(ComputeUnits); + } + + // Available only when prop == info::partition_property::partition_by_counts + template + typename std::enable_if<(prop == + info::partition_property::partition_by_counts), + vector_class>::type + create_sub_devices(const vector_class &Counts) const { + return impl->create_sub_devices(Counts); + } + + // Available only when prop == + // info::partition_property::partition_by_affinity_domain + template + typename std::enable_if< + (prop == info::partition_property::partition_by_affinity_domain), + vector_class>::type + create_sub_devices(info::partition_affinity_domain AffinityDomain) const { + return impl->create_sub_devices(AffinityDomain); + } + + template + typename info::param_traits::return_type + get_info() const { + return impl->get_info(); + } + + bool has_extension(const string_class &extension_name) const { + return impl->has_extension(extension_name); + } + + static vector_class + get_devices(info::device_type deviceType = info::device_type::all); + +private: + std::shared_ptr impl; + template + friend decltype(T::impl) detail::getSyclObjImpl(const T &SyclObject); +}; + +} // namespace sycl +} // namespace cl + +namespace std { +template <> struct hash { + size_t operator()(const cl::sycl::device &d) const { + return hash>()( + cl::sycl::detail::getSyclObjImpl(d)); + } +}; +} // namespace std diff --git a/sycl/include/CL/sycl/device_event.hpp b/sycl/include/CL/sycl/device_event.hpp new file mode 100644 index 000000000000..9a057d39e999 --- /dev/null +++ b/sycl/include/CL/sycl/device_event.hpp @@ -0,0 +1,37 @@ +//==---------- device_event.hpp --- SYCL device event ---------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +namespace cl { +namespace sycl { + +class device_event { +private: + cl::__spirv::OpTypeEvent *m_Event; + +public: + device_event(const device_event &rhs) = default; + device_event(device_event &&rhs) = default; + device_event &operator=(const device_event &rhs) = default; + device_event &operator=(device_event &&rhs) = default; + + device_event(cl::__spirv::OpTypeEvent *Event) : m_Event(Event) {} + + void wait() { + cl::__spirv::OpGroupWaitEvents(cl::__spirv::Scope::Workgroup, 1, + &m_Event); + } +}; + +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/device_selector.hpp b/sycl/include/CL/sycl/device_selector.hpp new file mode 100644 index 000000000000..1e70a55d44b4 --- /dev/null +++ b/sycl/include/CL/sycl/device_selector.hpp @@ -0,0 +1,55 @@ +//==------ device_selector.hpp - SYCL device selector ---------*- C++ --*---==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +// 4.6.1 Device selection class + +namespace cl { +namespace sycl { + +// Forward declarations +class device; + +class device_selector { +public: + virtual ~device_selector() = default; + + device select_device() const; + + virtual int operator()(const device &device) const = 0; +}; + +class default_selector : public device_selector { +public: + int operator()(const device &dev) const override; +}; + +class gpu_selector : public device_selector { +public: + int operator()(const device &dev) const override; +}; + +class cpu_selector : public device_selector { +public: + int operator()(const device &dev) const override; +}; + +class accelerator_selector : public device_selector { +public: + int operator()(const device &dev) const override; +}; + +class host_selector : public device_selector { +public: + int operator()(const device &dev) const override; +}; + +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/event.hpp b/sycl/include/CL/sycl/event.hpp new file mode 100644 index 000000000000..3ac11194bbf5 --- /dev/null +++ b/sycl/include/CL/sycl/event.hpp @@ -0,0 +1,83 @@ +//==---------------- event.hpp --- SYCL event ------------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +#include + +namespace cl { +namespace sycl { +// Forward declaration +class context; +class event { +public: + event(); + + event(cl_event clEvent, const context &syclContext); + + event(const event &rhs) = default; + + event(event &&rhs) = default; + + event &operator=(const event &rhs) = default; + + event &operator=(event &&rhs) = default; + + bool operator==(const event &rhs) const; + + bool operator!=(const event &rhs) const; + + cl_event get(); + + bool is_host() const; + + void wait() const; + + // vector_class get_wait_list(); + + // static void wait(const vector_class &eventList); + + // void wait_and_throw(); + + // static void wait_and_throw(const vector_class &eventList); + + template + typename info::param_traits::return_type get_info() const; + + template + typename info::param_traits::return_type + get_profiling_info() const; + +private: + event(std::shared_ptr event_impl); + + std::shared_ptr impl; + + template + friend decltype(T::impl) detail::getSyclObjImpl(const T &SyclObject); + + template + friend T detail::createSyclObjFromImpl(decltype(T::impl) ImplObj); +}; + +} // namespace sycl +} // namespace cl + +namespace std { +template <> struct hash { + size_t operator()(const cl::sycl::event &e) const { + return hash>()( + cl::sycl::detail::getSyclObjImpl(e)); + } +}; +} // namespace std diff --git a/sycl/include/CL/sycl/exception.hpp b/sycl/include/CL/sycl/exception.hpp new file mode 100644 index 000000000000..67dd1d3242e3 --- /dev/null +++ b/sycl/include/CL/sycl/exception.hpp @@ -0,0 +1,115 @@ +//==---------------- exception.hpp - SYCL exception ------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +// 4.9.2 Exception Class Interface + +#include +#include +#include + +namespace cl { +namespace sycl { + +class context; + +struct exception { + exception() = default; + + const char *what() const noexcept { return msg.c_str(); } + bool has_context() const; + context get_context() const; + cl_int get_cl_code() const; + +private: + std::string msg = "Message not specified"; + cl_int cl_err = CL_SUCCESS; + shared_ptr_class Context; + +protected: + exception(const char *msg, int cl_err = CL_SUCCESS, + shared_ptr_class Context = nullptr) + : msg(std::string(msg) + " " + + ((cl_err == CL_SUCCESS) ? "" : OCL_CODE_TO_STR(cl_err))), + cl_err(cl_err), Context(Context) {} +}; + +class exception_list { + using list_t = vector_class; + list_t list; + +public: + using value_type = exception_ptr_class; + using reference = value_type &; + using const_reference = const value_type &; + using size_type = ::size_t; + using iterator = list_t::const_iterator; + using const_iterator = list_t::const_iterator; + + ::size_t size() const { return list.size(); } + + void clear() noexcept { + list.clear(); + } + + void push_back(const_reference value) { + list.push_back(value); + } + + void push_back(value_type&& value) { + list.push_back(std::move(value)); + } + + /** first asynchronous exception */ + iterator begin() const { return list.begin(); } + /** refer to past-the-end last asynchronous exception */ + iterator end() const { return list.end(); } + + bool operator==(const exception_list &rhs) const { return list == rhs.list; } + + bool operator!=(const exception_list &rhs) const { return !(*this == rhs); } +}; + +using async_handler = function_class; + +class runtime_error : public exception { +public: + runtime_error(const char *str, cl_int err = CL_SUCCESS) + : exception(str, err) {} +}; +class kernel_error : public runtime_error { + using runtime_error::runtime_error; +}; +class 
accessor_error : public runtime_error {}; +class nd_range_error : public runtime_error {}; +class event_error : public runtime_error {}; +class invalid_parameter_error : public runtime_error { + using runtime_error::runtime_error; +}; +class device_error : public exception { +public: + device_error(const char *str, cl_int err = CL_SUCCESS) + : exception(str, err) {} + device_error() : device_error("") {} +}; +class compile_program_error : public device_error { + using device_error::device_error; +}; +class link_program_error : public device_error {}; +class invalid_object_error : public device_error { + using device_error::device_error; +}; +class memory_allocation_error : public device_error {}; +class platform_error : public device_error {}; +class profiling_error : public device_error {}; +class feature_not_supported : public device_error {}; + +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/group.hpp b/sycl/include/CL/sycl/group.hpp new file mode 100644 index 000000000000..969efaa104b5 --- /dev/null +++ b/sycl/include/CL/sycl/group.hpp @@ -0,0 +1,195 @@ +//==-------------- group.hpp --- SYCL work group ---------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace cl { +namespace sycl { +namespace detail { +class Builder; +} // namespace detail + +template class group { +public: + group() = delete; + + id get_id() const { return index; } + + size_t get_id(int dimension) const { return index[dimension]; } + + range get_global_range() const { return globalRange; } + + size_t get_global_range(int dimension) const { + return globalRange[dimension]; + } + + range get_local_range() const { return localRange; } + + size_t get_local_range(int dimension) const { return localRange[dimension]; } + + range get_group_range() const { return localRange; } + + size_t get_group_range(int dimension) const { return localRange[dimension]; } + + size_t operator[](int dimension) const { return index[dimension]; } + + template + typename std::enable_if<(dims == 1), size_t>::type get_linear() const { + range groupNum = globalRange / localRange; + return index[0]; + } + + template + typename std::enable_if<(dims == 2), size_t>::type get_linear() const { + range groupNum = globalRange / localRange; + return index[1] * groupNum[0] + index[0]; + } + + template + typename std::enable_if<(dims == 3), size_t>::type get_linear() const { + range groupNum = globalRange / localRange; + return (index[2] * groupNum[1] * groupNum[0]) + (index[1] * groupNum[0]) + + index[0]; + } + + // template + // void parallel_for_work_item(workItemFunctionT func) const; + + // template + // void parallel_for_work_item(range flexibleRange, + // workItemFunctionT func) const; + + /// Executes a work-group mem-fence with memory ordering on the local address + /// space, global address space or both based on the value of \p accessSpace. 
+ template + void mem_fence(typename std::enable_if< + accessMode == access::mode::read || + accessMode == access::mode::write || + accessMode == access::mode::read_write, + access::fence_space>::type accessSpace = + access::fence_space::global_and_local) const { + uint32_t flags = ::cl::__spirv::MemorySemantics::SequentiallyConsistent; + switch (accessSpace) { + case access::fence_space::global_space: + flags |= cl::__spirv::MemorySemantics::CrossWorkgroupMemory; + break; + case access::fence_space::local_space: + flags |= cl::__spirv::MemorySemantics::WorkgroupMemory; + break; + case access::fence_space::global_and_local: + default: + flags |= cl::__spirv::MemorySemantics::CrossWorkgroupMemory | + cl::__spirv::MemorySemantics::WorkgroupMemory; + break; + } + // TODO: currently, there is no good way in SPIRV to set the memory + // barrier only for load operations or only for store operations. + // The full read-and-write barrier is used and the template parameter + // 'accessMode' is ignored for now. Either SPIRV or SYCL spec may be + // changed to address this discrepancy between SPIRV and SYCL, + // or if we decide that 'accessMode' is the important feature then + // we can fix this later, for example, by using OpenCL 1.2 functions + // read_mem_fence() and write_mem_fence(). 
+ cl::__spirv::OpMemoryBarrier(cl::__spirv::Scope::Workgroup, flags); + } + + template + device_event async_work_group_copy(local_ptr dest, + global_ptr src, + size_t numElements) const { + cl::__spirv::OpTypeEvent *e = + cl::__spirv::OpGroupAsyncCopyGlobalToLocal( + cl::__spirv::Scope::Workgroup, + dest.get(), src.get(), numElements, 1, 0); + return device_event(e); + } + + template + device_event async_work_group_copy(global_ptr dest, + local_ptr src, + size_t numElements) const { + cl::__spirv::OpTypeEvent *e = + cl::__spirv::OpGroupAsyncCopyLocalToGlobal( + cl::__spirv::Scope::Workgroup, + dest.get(), src.get(), numElements, 1, 0); + return device_event(e); + } + + template + device_event async_work_group_copy(local_ptr dest, + global_ptr src, + size_t numElements, + size_t srcStride) const { + cl::__spirv::OpTypeEvent *e = + cl::__spirv::OpGroupAsyncCopyGlobalToLocal( + cl::__spirv::Scope::Workgroup, + dest.get(), src.get(), numElements, srcStride, 0); + return device_event(e); + } + + template + device_event async_work_group_copy(global_ptr dest, + local_ptr src, + size_t numElements, + size_t destStride) const { + cl::__spirv::OpTypeEvent *e = + cl::__spirv::OpGroupAsyncCopyLocalToGlobal( + cl::__spirv::Scope::Workgroup, + dest.get(), src.get(), numElements, destStride, 0); + return device_event(e); + } + + template + void wait_for(eventTN... Events) const { + waitForHelper(Events...); + } + + bool operator==(const group &rhs) const { + return (rhs.globalRange == this->globalRange) && + (rhs.localRange == this->localRange) && (rhs.index == this->index); + } + + bool operator!=(const group &rhs) const { + return !((*this) == rhs); + } + +private: + range globalRange; + range localRange; + id index; + + void waitForHelper() const {} + + void waitForHelper(device_event Event) const { + Event.wait(); + } + + template + void waitForHelper(T E, Ts... 
Es) const { + waitForHelper(E); + waitForHelper(Es...); + } + +protected: + friend class detail::Builder; + group(const range &G, const range &L, + const id &I) + : globalRange(G), localRange(L), index(I) {} +}; + +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp new file mode 100644 index 000000000000..cfd909279619 --- /dev/null +++ b/sycl/include/CL/sycl/handler.hpp @@ -0,0 +1,691 @@ +//==-------- handler.hpp --- SYCL command group handler --------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#ifdef __SYCL_DEVICE_ONLY__ +size_t get_global_size(uint dimindx); +size_t get_local_size(uint dimindx); +size_t get_global_id(uint dimindx); +size_t get_local_id(uint dimindx); +size_t get_global_offset(uint dimindx); +size_t get_group_id(uint dimindx); +#endif + +template +class __copy; + +template +class __update_host; + +template +class __fill; + +namespace cl { +namespace sycl { +// Forward declaration +class queue; + +template +class accessor; +template class buffer; +namespace detail { +class queue_impl; +template +class accessor_impl; + +template class buffer_impl; +// Type inference of first arg from a lambda +// auto fun = [&](item a) { a; }; +// lambda_arg_type value; # value type is item + +// Templated static declaration of a function whose single parameter is a +// pointer to a member function of type 'Func'. The member function must have +// 'RetType' return type, single argument of type 'Arg' and be declared with +// the 'const' qualifier. 
+template +static Arg member_ptr_helper(RetType (Func::*)(Arg) const); + +// Non-const version of the above template to match functors whose 'operator()' +// is declared w/o the 'const' qualifier. +template +static Arg member_ptr_helper(RetType (Func::*)(Arg)); + +template +decltype(member_ptr_helper(&F::operator())) argument_helper(F); + +template +using lambda_arg_type = decltype(argument_helper(std::declval())); + +} // namespace detail + +template +class accessor_base; + +template +class accessor; + +// 4.8.3 Command group handler class +class handler { + template + friend class accessor; + + template + friend class detail::accessor_impl; + + template + friend class detail::buffer_impl; + + friend class detail::queue_impl; + +protected: + simple_scheduler::Node m_Node; + bool isHost = false; + unique_ptr_class m_Finalized; + // TODO: Obtain is host information from Queue when we split queue_impl + // interface and implementation. + handler(std::shared_ptr Queue, bool host) + : m_Node(std::move(Queue)), isHost(host) {} + + event finalize() { + if (!m_Finalized) { + event *Event = + new event(simple_scheduler::Scheduler::getInstance().addNode( + std::move(m_Node))); + m_Finalized.reset(Event); + } + return *m_Finalized.get(); + } + + ~handler() = default; + + bool is_host() { return isHost; } + + template + void AddBufDep(detail::buffer_impl &Buf) { + m_Node.addBufRequirement(Buf); + } + + template + void setArgsHelper(int ArgIndex, T &&Arg, Ts &&... 
Args) { + set_arg(ArgIndex, std::move(Arg)); + setArgsHelper(++ArgIndex, std::move(Args)...); + } + + void setArgsHelper(int ArgIndex) {} + + template + void setArgHelper(int argIndex, accessor &&arg) { + m_Node.addAccRequirement(std::move(arg), argIndex); + } + + template void setArgHelper(int argIndex, T &&arg) { + using Type = typename std::remove_reference::type; + shared_ptr_class Ptr = std::make_shared(std::move(arg)); + m_Node.addInteropArg(Ptr, sizeof(T), argIndex); + } + + // TODO: implement when sampler class is ready + // void setArgHelper(int argIndex, sampler &&arg) {} + + void verifySyclKernelInvoc(const kernel &SyclKernel) { + if (is_host()) { + throw invalid_object_error( + "This kernel invocation method cannot be used on the host"); + } + if (SyclKernel.is_host()) { + throw invalid_object_error("Invalid kernel type, OpenCL expected"); + } + } + + // This dummy functor is passed to Node::addKernel in SYCL kernel + // parallel_for invocation with range. + template struct DummyFunctor { + void operator()(id) {} + }; + + // Method provides unified getting of the range from an accessor, because + // 1 dimension accessor has no get_range method according to the SYCL + // specification + template + struct getAccessorRangeHelper { + static range + getAccessorRange(const accessor &Acc) { + return Acc.get_range(); + } + }; + + template + struct getAccessorRangeHelper { + static range<1> + getAccessorRange(const accessor &Acc) { + return range<1>(Acc.get_count()); + } + }; + +public: + handler(const handler &) = delete; + handler(handler &&) = delete; + handler &operator=(const handler &) = delete; + handler &operator=(handler &&) = delete; + + // template + // void require(accessor acc); + + // OpenCL interoperability interface + template void set_arg(int argIndex, T &&arg) { + setArgHelper(argIndex, std::move(arg)); + } + + template void set_args(Ts &&... 
args) { + setArgsHelper(0, std::move(args)...); + } + +#ifdef __SYCL_DEVICE_ONLY__ + template + __attribute__((sycl_kernel)) void kernel_single_task(KernelType kernelFunc) { + kernelFunc(); + } +#endif + + // Kernel dispatch API + // Kernel is represented as a lambda. + template + void single_task(KernelType kernelFunc) { +#ifdef __SYCL_DEVICE_ONLY__ + kernel_single_task(kernelFunc); +#else + using KI = cl::sycl::detail::KernelInfo; + m_Node.addKernel(KI::getName(), KI::getNumParams(), &KI::getParamDesc(0), + std::move(kernelFunc)); +#endif + } + + // Kernel is represented as a functor - simply redirect to the lambda-based + // form of invocation, setting kernel name type to the functor type. + template + void single_task(KernelFunctorType KernelFunctor) { + single_task(KernelFunctor); + } + +#ifdef __SYCL_DEVICE_ONLY__ + template + __attribute__((sycl_kernel)) void kernel_parallel_for( + typename std::enable_if, + id>::value && + (dimensions > 0 && dimensions < 4), + KernelType>::type kernelFunc) { + id global_id; + for (int i = 0; i < dimensions; ++i) { + global_id[i] = get_global_id(i); + } + kernelFunc(global_id); + } + + template + __attribute__((sycl_kernel)) void kernel_parallel_for( + typename std::enable_if, + item>::value && + (dimensions > 0 && dimensions < 4), + KernelType>::type kernelFunc) { + id global_id; + range global_size; + for (int i = 0; i < dimensions; ++i) { + global_id[i] = get_global_id(i); + global_size[i] = get_global_size(i); + } + item Item = + detail::Builder::createItem(global_size, global_id); + kernelFunc(Item); + } + + template + __attribute__((sycl_kernel)) void kernel_parallel_for( + typename std::enable_if, + nd_item>::value && + (dimensions > 0 && dimensions < 4), + KernelType>::type kernelFunc) { + range global_size; + range local_size; + id group_id; + id global_id; + id local_id; + id global_offset; + + for (int i = 0; i < dimensions; ++i) { + global_size[i] = get_global_size(i); + local_size[i] = get_local_size(i); + 
group_id[i] = get_group_id(i); + global_id[i] = get_global_id(i); + local_id[i] = get_local_id(i); + global_offset[i] = get_global_offset(i); + } + + group Group = detail::Builder::createGroup( + global_size, local_size, group_id); + item globalItem = + detail::Builder::createItem(global_size, global_id, + global_offset); + item localItem = + detail::Builder::createItem(local_size, local_id); + nd_item Nd_item = + detail::Builder::createNDItem(globalItem, localItem, Group); + + kernelFunc(Nd_item); + } +#endif + + template + void parallel_for(range numWorkItems, KernelType kernelFunc) { +#ifdef __SYCL_DEVICE_ONLY__ + kernel_parallel_for(kernelFunc); +#else + using KI = cl::sycl::detail::KernelInfo; + m_Node + .addKernel>( + KI::getName(), KI::getNumParams(), &KI::getParamDesc(0), + std::move(kernelFunc), numWorkItems); +#endif + } + + // The version for a functor kernel. + template + void parallel_for(range numWorkItems, KernelType kernelFunc) { + parallel_for(numWorkItems, kernelFunc); + } + + // The version with an offset + template + void parallel_for(range numWorkItems, + id workItemOffset, KernelType kernelFunc) { +#ifdef __SYCL_DEVICE_ONLY__ + kernel_parallel_for(kernelFunc); +#else + using KI = cl::sycl::detail::KernelInfo; + m_Node + .addKernel>( + KI::getName(), KI::getNumParams(), &KI::getParamDesc(0), + std::move(kernelFunc), numWorkItems, workItemOffset); +#endif + } + + template + void parallel_for(nd_range executionRange, + KernelType kernelFunc) { +#ifdef __SYCL_DEVICE_ONLY__ + kernel_parallel_for(kernelFunc); +#else + using KI = cl::sycl::detail::KernelInfo; + m_Node.addKernel( + KI::getName(), KI::getNumParams(), &KI::getParamDesc(0), + std::move(kernelFunc), executionRange); +#endif + } + + // The version for a functor kernel. 
+ template + void parallel_for(nd_range executionRange, + KernelType kernelFunc) { + + parallel_for(executionRange, + kernelFunc); + } + + // template + // void parallel_for_work_group(range numWorkGroups, + // WorkgroupFunctionType kernelFunc); + + // template + // void parallel_for_work_group(range numWorkGroups, + // range workGroupSize, + // WorkgroupFunctionType kernelFunc); + + // The kernel invocation methods below have no functors and cannot be + // called on host. + // TODO current workaround passes dummy functors to Node::addKernel. + // A better way of adding kernels to scheduler if they cannot be run on host + // would be preferrable. + void single_task(kernel syclKernel) { + verifySyclKernelInvoc(syclKernel); + std::function DummyLambda = []() {}; + m_Node.addKernel(syclKernel.get_info(), 0, + nullptr, std::move(DummyLambda), syclKernel.get()); + } + + template + void parallel_for(range numWorkItems, kernel syclKernel) { + verifySyclKernelInvoc(syclKernel); + m_Node.addKernel, dimensions, id>( + syclKernel.get_info(), 0, nullptr, + DummyFunctor(), numWorkItems, syclKernel.get()); + } + + template + void parallel_for(range numWorkItems, + id workItemOffset, kernel syclKernel) { + verifySyclKernelInvoc(syclKernel); + m_Node.addKernel, dimensions, id>( + syclKernel.get_info(), 0, nullptr, + DummyFunctor(), numWorkItems, workItemOffset, + syclKernel.get()); + } + + template + void parallel_for(nd_range ndRange, kernel syclKernel) { + verifySyclKernelInvoc(syclKernel); + m_Node.addKernel( + syclKernel.get_info(), 0, nullptr, + [](nd_item) {}, ndRange, syclKernel.get()); + } + + // Note: the kernel invocation methods below are only planned to be added + // to the spec as of v1.2.1 rev. 3, despite already being present in SYCL + // conformance tests. 
+ + template + void single_task(kernel syclKernel, KernelType kernelFunc) { +#ifdef __SYCL_DEVICE_ONLY__ + kernel_single_task(kernelFunc); +#else + cl_kernel clKernel = nullptr; + if (!is_host()) { + clKernel = syclKernel.get(); + } + using KI = cl::sycl::detail::KernelInfo; + m_Node.addKernel(KI::getName(), KI::getNumParams(), &KI::getParamDesc(0), + std::move(kernelFunc), clKernel); +#endif + } + + // The version for a functor kernel. + template + void single_task(kernel syclKernel, KernelType kernelFunc) { + single_task(syclKernel, kernelFunc); + } + + template + void parallel_for(range numWorkItems, kernel syclKernel, + KernelType kernelFunc) { +#ifdef __SYCL_DEVICE_ONLY__ + kernel_parallel_for(kernelFunc); +#else + cl_kernel clKernel = nullptr; + if (!is_host()) { + clKernel = syclKernel.get(); + } + using KI = cl::sycl::detail::KernelInfo; + m_Node + .addKernel>( + KI::getName(), KI::getNumParams(), &KI::getParamDesc(0), + std::move(kernelFunc), numWorkItems, clKernel); +#endif + } + + // The version for a functor kernel. 
+ template + void parallel_for(range numWorkItems, kernel syclKernel, + KernelType kernelFunc) { + + parallel_for(numWorkItems, syclKernel, + kernelFunc); + } + + template + void parallel_for(range numWorkItems, + id workItemOffset, kernel syclKernel, + KernelType kernelFunc) { +#ifdef __SYCL_DEVICE_ONLY__ + kernel_parallel_for(kernelFunc); +#else + cl_kernel clKernel = nullptr; + if (!is_host()) { + clKernel = syclKernel.get(); + } + using KI = cl::sycl::detail::KernelInfo; + m_Node + .addKernel>( + KI::getName(), KI::getNumParams(), &KI::getParamDesc(0), + std::move(kernelFunc), numWorkItems, workItemOffset, clKernel); +#endif + } + + template + void parallel_for(nd_range ndRange, kernel syclKernel, + KernelType kernelFunc) { +#ifdef __SYCL_DEVICE_ONLY__ + kernel_parallel_for(kernelFunc); +#else + cl_kernel clKernel = nullptr; + if (!is_host()) { + clKernel = syclKernel.get(); + } + using KI = cl::sycl::detail::KernelInfo; + m_Node.addKernel( + KI::getName(), KI::getNumParams(), &KI::getParamDesc(0), + std::move(kernelFunc), ndRange, clKernel); +#endif + } + + // The version for a functor kernel. 
+ template + void parallel_for(nd_range ndRange, kernel syclKernel, + KernelType kernelFunc) { + parallel_for(ndRange, syclKernel, + kernelFunc); + } + + // template + // void parallel_for_work_group(range num_work_groups, kernel + // syclKernel, WorkgroupFunctionType kernelFunc); + + // template + // void parallel_for_work_group(range num_work_groups, + // range work_group_size, kernel syclKernel, WorkgroupFunctionType + // kernelFunc); + + // Explicit copy operations API + template + typename std::enable_if<(tgt == access::target::global_buffer || + tgt == access::target::constant_buffer), + void>::type + copy(accessor src, + shared_ptr_class dest) { + range Range = + getAccessorRangeHelper::getAccessorRange(src); + // TODO use buffer_allocator when it is possible + buffer> Buffer( + (shared_ptr_class)dest, Range, + {property::buffer::use_host_ptr()}); + accessor + DestAcc(Buffer, *this); + copy(src, DestAcc); + } + + template + typename std::enable_if<(tgt == access::target::global_buffer || + tgt == access::target::constant_buffer), + void>::type + copy(shared_ptr_class src, + accessor dest) { + range Range = + getAccessorRangeHelper::getAccessorRange(dest); + // TODO use buffer_allocator when it is possible + buffer> Buffer( + (shared_ptr_class)src, Range, + {property::buffer::use_host_ptr()}); + accessor + SrcAcc(Buffer, *this); + copy(SrcAcc, dest); + } + + template + typename std::enable_if<(tgt == access::target::global_buffer || + tgt == access::target::constant_buffer), + void>::type + copy(accessor src, T_dest *dest) { + range Range = + getAccessorRangeHelper::getAccessorRange(src); + // TODO use buffer_allocator when it is possible + buffer> Buffer( + (T_src *)dest, Range, {property::buffer::use_host_ptr()}); + accessor + DestAcc(Buffer, *this); + copy(src, DestAcc); + } + + template + typename std::enable_if<(tgt == access::target::global_buffer || + tgt == access::target::constant_buffer), + void>::type + copy(const T_src *src, accessor dest) { + 
range Range = + getAccessorRangeHelper::getAccessorRange(dest); + // TODO use buffer_allocator when it is possible + buffer> Buffer( + (T_dest *)src, Range, {property::buffer::use_host_ptr()}); + accessor + SrcAcc(Buffer, *this); + copy(SrcAcc, dest); + } + + template < + typename T_src, int dim_src, access::mode mode_src, + access::target tgt_src, typename T_dest, int dim_dest, + access::mode mode_dest, access::target tgt_dest, + access::placeholder isPlaceholder_src = access::placeholder::false_t, + access::placeholder isPlaceholder_dest = access::placeholder::false_t> + typename std::enable_if<((tgt_src == access::target::global_buffer || + tgt_src == access::target::constant_buffer) && + (tgt_dest == access::target::global_buffer || + tgt_dest == access::target::constant_buffer)), + void>::type + copy(accessor src, + accessor + dest) { + if (isHost) { + range Range = + getAccessorRangeHelper::getAccessorRange(src); + parallel_for< + class __copy< + T_src, dim_src, mode_src, tgt_src, T_dest, dim_dest, mode_dest, + tgt_dest, isPlaceholder_src, isPlaceholder_dest> + >(Range, [=](id Index) { + dest[Index] = src[Index]; + }); + } else { +#ifndef __SYCL_DEVICE_ONLY__ + m_Node.addExplicitMemOp<>(src, dest); +#endif + } + finalize(); + // force wait. 
+ } + + template + typename std::enable_if<(tgt == access::target::global_buffer || + tgt == access::target::constant_buffer), + void>::type + update_host(accessor acc) { +#ifndef __SYCL_DEVICE_ONLY__ + assert(!m_Finalized && "The final event of this handler must not be set."); + event *Event = new event; + simple_scheduler::Scheduler::getInstance().updateHost(acc, *Event); + m_Finalized.reset(Event); +#endif + } + + template + typename std::enable_if<(tgt == access::target::global_buffer || + tgt == access::target::constant_buffer), + void>::type + fill(accessor dest, const T &src) { + // TODO add check:T must be an integral scalar value or a SYCL vector type + if (!isHost && dim == 1) { +#ifndef __SYCL_DEVICE_ONLY__ + m_Node.addExplicitMemOp<>(dest, src); +#endif + } else { + // TODO multidimensional case with offset is not supported. + // Fix it when parallel_for with offset is implemented + range Range = + getAccessorRangeHelper::getAccessorRange(dest); + parallel_for>(Range, + [=](id Index) { + dest[Index] = src; + }); + } + } +}; +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/id.hpp b/sycl/include/CL/sycl/id.hpp new file mode 100644 index 000000000000..326e0021f789 --- /dev/null +++ b/sycl/include/CL/sycl/id.hpp @@ -0,0 +1,571 @@ +//==----------- id.hpp --- SYCL iteration id -------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +namespace cl { +namespace sycl { +template class range; +template struct id : public detail::array { +public: + using base = detail::array; + INLINE_IF_DEVICE id() = default; + + /* The following constructor is only available in the id struct + * specialization where: dimensions==1 */ + template + id(typename std::enable_if<(N == 1), size_t>::type dim0) : base(dim0) {} + + template + id(typename std::enable_if<(N == 1), const range &>::type + range_size) + : base(range_size.get(0)) {} + + template + id(typename std::enable_if<(N == 1), const item &>::type item) + : base(item.get_id(0)) {} + + /* The following constructor is only available in the id struct + * specialization where: dimensions==2 */ + template + id(typename std::enable_if<(N == 2), size_t>::type dim0, size_t dim1) + : base(dim0, dim1) {} + + template + id(typename std::enable_if<(N == 2), const range &>::type + range_size) + : base(range_size.get(0), range_size.get(1)) {} + + template + id(typename std::enable_if<(N == 2), const item &>::type item) + : base(item.get_id(0), item.get_id(1)) {} + + /* The following constructor is only available in the id struct + * specialization where: dimensions==3 */ + template + id(typename std::enable_if<(N == 3), size_t>::type dim0, size_t dim1, + size_t dim2) + : base(dim0, dim1, dim2) {} + + template + id(typename std::enable_if<(N == 3), const range &>::type + range_size) + : base(range_size.get(0), range_size.get(1), range_size.get(2)) {} + + template + id(typename std::enable_if<(N == 3), const item &>::type item) + : base(item.get_id(0), item.get_id(1), item.get_id(2)) {} + + explicit operator range() const { + range result; + for (int i = 0; i < dimensions; ++i) { + result[i] = this->get(i); + } + return result; + } + + // OP is: +, -, *, /, %, <<, >>, &, |, ^, &&, ||, <, >, <=, >= + id operator+(const id &rhs) const { + 
id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] + rhs.common_array[i]; + } + return result; + } + id operator-(const id &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] - rhs.common_array[i]; + } + return result; + } + id operator*(const id &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] * rhs.common_array[i]; + } + return result; + } + id operator/(const id &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] / rhs.common_array[i]; + } + return result; + } + id operator%(const id &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] % rhs.common_array[i]; + } + return result; + } + id operator<<(const id &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] << rhs.common_array[i]; + } + return result; + } + id operator>>(const id &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] >> rhs.common_array[i]; + } + return result; + } + id operator&(const id &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] & rhs.common_array[i]; + } + return result; + } + id operator|(const id &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] | rhs.common_array[i]; + } + return result; + } + id operator^(const id &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] ^ rhs.common_array[i]; + } + return result; + } + id operator&&(const id &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] && 
rhs.common_array[i]; + } + return result; + } + id operator||(const id &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] || rhs.common_array[i]; + } + return result; + } + id operator<(const id &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] < rhs.common_array[i]; + } + return result; + } + id operator>(const id &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] > rhs.common_array[i]; + } + return result; + } + id operator<=(const id &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] <= rhs.common_array[i]; + } + return result; + } + id operator>=(const id &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] >= rhs.common_array[i]; + } + return result; + } + + // OP is: +, -, *, /, %, <<, >>, &, |, ^, &&, ||, <, >, <=, >= + id operator+(const size_t &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] + rhs; + } + return result; + } + id operator-(const size_t &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] - rhs; + } + return result; + } + id operator*(const size_t &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] * rhs; + } + return result; + } + id operator/(const size_t &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] / rhs; + } + return result; + } + id operator%(const size_t &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] % rhs; + } + return result; + } + id operator<<(const size_t &rhs) const { + id 
result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] << rhs; + } + return result; + } + id operator>>(const size_t &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] >> rhs; + } + return result; + } + id operator&(const size_t &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] & rhs; + } + return result; + } + id operator|(const size_t &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] | rhs; + } + return result; + } + id operator^(const size_t &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] ^ rhs; + } + return result; + } + id operator&&(const size_t &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] && rhs; + } + return result; + } + id operator||(const size_t &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] || rhs; + } + return result; + } + id operator<(const size_t &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] < rhs; + } + return result; + } + id operator>(const size_t &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] > rhs; + } + return result; + } + id operator<=(const size_t &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] <= rhs; + } + return result; + } + id operator>=(const size_t &rhs) const { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] >= rhs; + } + return result; + } + + // OP is: +=, -=, *=, /=, %=, <<=, >>=, &=, |=, ^= + id &operator+=(const id &rhs) { 
+ for (int i = 0; i < dimensions; ++i) { + this->common_array[i] += rhs[i]; + } + return *this; + } + id &operator-=(const id &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] -= rhs.common_array[i]; + } + return *this; + } + id &operator*=(const id &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] *= rhs.common_array[i]; + } + return *this; + } + id &operator/=(const id &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] /= rhs.common_array[i]; + } + return *this; + } + id &operator%=(const id &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] %= rhs.common_array[i]; + } + return *this; + } + id &operator<<=(const id &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] <<= rhs.common_array[i]; + } + return *this; + } + id &operator>>=(const id &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] >>= rhs.common_array[i]; + } + return *this; + } + id &operator&=(const id &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] &= rhs.common_array[i]; + } + return *this; + } + id &operator|=(const id &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] |= rhs.common_array[i]; + } + return *this; + } + id &operator^=(const id &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] ^= rhs.common_array[i]; + } + return *this; + } + + // OP is: +=, -=, *=, /=, %=, <<=, >>=, &=, |=, ^= + id &operator+=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] += rhs; + } + return *this; + } + id &operator-=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] -= rhs; + } + return *this; + } + id &operator*=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] *= rhs; + } + return *this; + } + id &operator/=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] /= rhs; + 
} + return *this; + } + id &operator%=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] %= rhs; + } + return *this; + } + id &operator<<=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] <<= rhs; + } + return *this; + } + id &operator>>=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] >>= rhs; + } + return *this; + } + id &operator&=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] &= rhs; + } + return *this; + } + id &operator|=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] |= rhs; + } + return *this; + } + id &operator^=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] ^= rhs; + } + return *this; + } + + // OP is: +, -, *, /, %, <<, >>, &, |, ^, <, >, <=, >=, &&, || + friend id operator+(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs + rhs.common_array[i]; + } + return result; + } + friend id operator-(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs - rhs.common_array[i]; + } + return result; + } + friend id operator*(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs * rhs.common_array[i]; + } + return result; + } + friend id operator/(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs / rhs.common_array[i]; + } + return result; + } + friend id operator%(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs % rhs.common_array[i]; + } + return result; + } + friend id operator<<(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + 
result.common_array[i] = lhs << rhs.common_array[i]; + } + return result; + } + friend id operator>>(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs >> rhs.common_array[i]; + } + return result; + } + friend id operator&(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs & rhs.common_array[i]; + } + return result; + } + friend id operator|(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs | rhs.common_array[i]; + } + return result; + } + friend id operator^(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs ^ rhs.common_array[i]; + } + return result; + } + friend id operator<(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs < rhs.common_array[i]; + } + return result; + } + friend id operator>(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs > rhs.common_array[i]; + } + return result; + } + friend id operator<=(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs <= rhs.common_array[i]; + } + return result; + } + friend id operator>=(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs >= rhs.common_array[i]; + } + return result; + } + friend id operator&&(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs && rhs.common_array[i]; + } + return result; + } + friend id operator||(const size_t &lhs, + const id &rhs) { + id result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs || rhs.common_array[i]; + } + 
return result; + } +}; + +namespace detail { +template INLINE_IF_DEVICE +size_t getOffsetForId(range Range, id Id, + id Offset) { + size_t offset = 0; + for (int i = 0; i < dimensions; ++i) + offset = offset * Range[i] + Offset[i] + Id[i]; + return offset; +} +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/image.hpp b/sycl/include/CL/sycl/image.hpp new file mode 100644 index 000000000000..fdbcdd1723a0 --- /dev/null +++ b/sycl/include/CL/sycl/image.hpp @@ -0,0 +1,158 @@ +//==------------ image.hpp -------------------------------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +namespace cl { +namespace sycl { + +using byte = unsigned char; + +using image_allocator = std::allocator; + +template class range; + +template +class image { +public: + image(image_channel_order order, image_channel_type type, + const range &range, const property_list &propList = {}) { + impl = std::make_shared>( + order, type, range, propList); + } + + //image(image_channel_order order, image_channel_type type, + //const range &range, AllocatorT allocator, + //const property_list &propList = {}); + + /* Available only when: dimensions > 1 */ + //image(image_channel_order order, image_channel_type type, + //const range &range, const range &pitch, + //const property_list &propList = {}); + + /* Available only when: dimensions > 1 */ + //image(image_channel_order order, image_channel_type type, + //const range &range, const range &pitch, + //AllocatorT allocator, const property_list &propList = {}); + + //image(void *hostPointer, image_channel_order order, image_channel_type type, + //const range &range, const property_list &propList = {}); + + //image(void *hostPointer, image_channel_order 
order, image_channel_type type, + //const range &range, AllocatorT allocator, + //const property_list &propList = {}); + + //image(const void *hostPointer, image_channel_order order, + //image_channel_type type, const range &range, + //const property_list &propList = {}); + + //image(const void *hostPointer, image_channel_order order, + //image_channel_type type, const range &range, + //AllocatorT allocator, const property_list &propList = {}); + + /* Available only when: dimensions > 1 */ + //image(void *hostPointer, image_channel_order order, image_channel_type type, + //const range &range, range &pitch, + //const property_list &propList = {}); + + /* Available only when: dimensions > 1 */ + //image(void *hostPointer, image_channel_order order, image_channel_type type, + //const range &range, range &pitch, + //AllocatorT allocator, const property_list &propList = {}); + + //image(shared_ptr_class &hostPointer, image_channel_order order, + //image_channel_type type, const range &range, + //const property_list &propList = {}); + + //image(shared_ptr_class &hostPointer, image_channel_order order, + //image_channel_type type, const range &range, + //AllocatorT allocator, const property_list &propList = {}); + + /* Available only when: dimensions > 1 */ + //image(shared_ptr_class &hostPointer, image_channel_order order, + //image_channel_type type, const range &range, + //const range &pitch, const property_list &propList = {}); + + /* Available only when: dimensions > 1 */ + //image(shared_ptr_class &hostPointer, image_channel_order order, + //image_channel_type type, const range &range, + //const range &pitch, AllocatorT allocator, + //const property_list &propList = {}); + + image(cl_mem clMemObject, const context &syclContext, + event availableEvent = {}); + + image(const image &rhs) = default; + + image(image &&rhs) = default; + + image &operator=(const image &rhs) = default; + + image &operator=(image &&rhs) = default; + + ~image() = default; + + bool 
operator==(const image &rhs) const { return impl == rhs.impl; } + + bool operator!=(const image &rhs) const { return !(*this == rhs); } + + /* -- common interface members -- */ + + /* -- property interface members -- */ + + range get_range() const { return impl->get_range(); } + + /* Available only when: dimensions > 1 */ + range get_pitch() const { return impl->get_pitch(); } + + size_t get_size() const { return impl->get_size(); } + + size_t get_count() const { return impl->get_count(); } + + AllocatorT get_allocator() const { return impl->get_allocator(); } + + template + accessor + get_access(handler &commandGroupHandler) { + return impl->template get_access(); + } + + template + accessor + get_access() { + return impl->template get_access(); + } + + //template + //void set_final_data(Destination finalData = std::nullptr); + + void set_write_back(bool flag = true) { impl->set_write_back(flag); } + +private: + shared_ptr_class> impl; + template + friend decltype(Obj::impl) detail::getSyclObjImpl(const Obj &SyclObject); +}; + +} // namespace sycl +} // namespace cl + +namespace std { +template +struct hash> { + size_t operator()(const cl::sycl::image &i) const { + return hash>>()(i.impl); + } +}; +} // namespace std diff --git a/sycl/include/CL/sycl/info/info_desc.hpp b/sycl/include/CL/sycl/info/info_desc.hpp new file mode 100644 index 000000000000..264f3f4340ee --- /dev/null +++ b/sycl/include/CL/sycl/info/info_desc.hpp @@ -0,0 +1,382 @@ +//==------- info_desc.hpp - SYCL information descriptors -------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +namespace cl { +namespace sycl { + +class program; +class device; +class platform; + +namespace info { + +// Information descriptors +// A.1 Platform information descriptors +enum class platform : cl_platform_info { + profile = CL_PLATFORM_PROFILE, + version = CL_PLATFORM_VERSION, + name = CL_PLATFORM_NAME, + vendor = CL_PLATFORM_VENDOR, + extensions = CL_PLATFORM_EXTENSIONS +}; + +// A.2 Context information desctiptors +enum class context : cl_context_info { + reference_count = CL_CONTEXT_REFERENCE_COUNT, + platform = CL_CONTEXT_PLATFORM, + devices = CL_CONTEXT_DEVICES, +}; + +// A.3 Device information descriptors +enum class device : cl_device_info { + device_type = CL_DEVICE_TYPE, + vendor_id = CL_DEVICE_VENDOR_ID, + max_compute_units = CL_DEVICE_MAX_COMPUTE_UNITS, + max_work_item_dimensions = CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, + max_work_item_sizes = CL_DEVICE_MAX_WORK_ITEM_SIZES, + max_work_group_size = CL_DEVICE_MAX_WORK_GROUP_SIZE, + + preferred_vector_width_char = CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, + preferred_vector_width_short = CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, + preferred_vector_width_int = CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, + preferred_vector_width_long = CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, + preferred_vector_width_float = CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, + preferred_vector_width_double = CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, + preferred_vector_width_half = CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, + + native_vector_width_char = CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, + native_vector_width_short = CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, + native_vector_width_int = CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, + native_vector_width_long = CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, + native_vector_width_float = CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, + native_vector_width_double = CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, + 
native_vector_width_half = CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, + + max_clock_frequency = CL_DEVICE_MAX_CLOCK_FREQUENCY, + address_bits = CL_DEVICE_ADDRESS_BITS, + max_mem_alloc_size = CL_DEVICE_MAX_MEM_ALLOC_SIZE, + image_support = CL_DEVICE_IMAGE_SUPPORT, + max_read_image_args = CL_DEVICE_MAX_READ_IMAGE_ARGS, + max_write_image_args = CL_DEVICE_MAX_WRITE_IMAGE_ARGS, + image2d_max_width = CL_DEVICE_IMAGE2D_MAX_WIDTH, + image2d_max_height = CL_DEVICE_IMAGE2D_MAX_HEIGHT, + image3d_max_width = CL_DEVICE_IMAGE3D_MAX_WIDTH, + image3d_max_height = CL_DEVICE_IMAGE3D_MAX_HEIGHT, + image3d_max_depth = CL_DEVICE_IMAGE3D_MAX_DEPTH, + image_max_buffer_size = CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, + image_max_array_size = CL_DEVICE_IMAGE_MAX_ARRAY_SIZE, + max_samplers = CL_DEVICE_MAX_SAMPLERS, + max_parameter_size = CL_DEVICE_MAX_PARAMETER_SIZE, + mem_base_addr_align = CL_DEVICE_MEM_BASE_ADDR_ALIGN, + half_fp_config = CL_DEVICE_HALF_FP_CONFIG, + single_fp_config = CL_DEVICE_SINGLE_FP_CONFIG, + double_fp_config = CL_DEVICE_DOUBLE_FP_CONFIG, + global_mem_cache_type = CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, + global_mem_cache_line_size = CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, + global_mem_cache_size = CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, + global_mem_size = CL_DEVICE_GLOBAL_MEM_SIZE, + max_constant_buffer_size = CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, + max_constant_args = CL_DEVICE_MAX_CONSTANT_ARGS, + local_mem_type = CL_DEVICE_LOCAL_MEM_TYPE, + local_mem_size = CL_DEVICE_LOCAL_MEM_SIZE, + error_correction_support = CL_DEVICE_ERROR_CORRECTION_SUPPORT, + host_unified_memory = CL_DEVICE_HOST_UNIFIED_MEMORY, + profiling_timer_resolution = CL_DEVICE_PROFILING_TIMER_RESOLUTION, + is_endian_little = CL_DEVICE_ENDIAN_LITTLE, + is_available = CL_DEVICE_AVAILABLE, + is_compiler_available = CL_DEVICE_COMPILER_AVAILABLE, + is_linker_available = CL_DEVICE_LINKER_AVAILABLE, + execution_capabilities = CL_DEVICE_EXECUTION_CAPABILITIES, + queue_profiling = CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES, + built_in_kernels = 
CL_DEVICE_BUILT_IN_KERNELS, + platform = CL_DEVICE_PLATFORM, + name = CL_DEVICE_NAME, + vendor = CL_DEVICE_VENDOR, + driver_version = CL_DRIVER_VERSION, + profile = CL_DEVICE_PROFILE, + version = CL_DEVICE_VERSION, + opencl_c_version = CL_DEVICE_OPENCL_C_VERSION, + extensions = CL_DEVICE_EXTENSIONS, + printf_buffer_size = CL_DEVICE_PRINTF_BUFFER_SIZE, + preferred_interop_user_sync = CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, + parent_device = CL_DEVICE_PARENT_DEVICE, + partition_max_sub_devices = CL_DEVICE_PARTITION_MAX_SUB_DEVICES, + partition_properties = CL_DEVICE_PARTITION_PROPERTIES, + partition_affinity_domains = CL_DEVICE_PARTITION_AFFINITY_DOMAIN, + partition_type_affinity_domain = CL_DEVICE_PARTITION_TYPE, + reference_count = CL_DEVICE_REFERENCE_COUNT, + max_num_sub_groups = CL_DEVICE_MAX_NUM_SUB_GROUPS, + sub_group_independent_forward_progress = + CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS, + partition_type_property +}; + +enum class device_type : cl_device_type { + cpu = CL_DEVICE_TYPE_CPU, + gpu = CL_DEVICE_TYPE_GPU, + accelerator = CL_DEVICE_TYPE_ACCELERATOR, + custom = CL_DEVICE_TYPE_CUSTOM, + automatic, + host, + all = CL_DEVICE_TYPE_ALL +}; + +enum class partition_property : cl_device_partition_property { + partition_equally = CL_DEVICE_PARTITION_EQUALLY, + partition_by_counts = CL_DEVICE_PARTITION_BY_COUNTS, + partition_by_affinity_domain = CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN, + no_partition +}; + +enum class partition_affinity_domain : cl_device_affinity_domain { + not_applicable = 0, + numa = CL_DEVICE_AFFINITY_DOMAIN_NUMA, + L4_cache = CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE, + L3_cache = CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE, + L2_cache = CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE, + L1_cache = CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE, + next_partitionable = CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE +}; + +enum class local_mem_type : int { none, local, global }; + +enum class fp_config : cl_device_fp_config { + denorm = CL_FP_DENORM, + inf_nan = 
CL_FP_INF_NAN, + round_to_nearest = CL_FP_ROUND_TO_NEAREST, + round_to_zero = CL_FP_ROUND_TO_ZERO, + round_to_inf = CL_FP_ROUND_TO_INF, + fma = CL_FP_FMA, + correctly_rounded_divide_sqrt, + soft_float +}; + +enum class global_mem_cache_type : int { none, read_only, write_only }; + +enum class execution_capability : unsigned int { + exec_kernel, + exec_native_kernel +}; + +// A.4 Queue information desctiptors +enum class queue : cl_command_queue_info { + context = CL_QUEUE_CONTEXT, + device = CL_QUEUE_DEVICE, + reference_count = CL_QUEUE_REFERENCE_COUNT +}; + +// A.5 Kernel information desctiptors +enum class kernel : cl_kernel_info { + function_name = CL_KERNEL_FUNCTION_NAME, + num_args = CL_KERNEL_NUM_ARGS, + context = CL_KERNEL_CONTEXT, + program = CL_KERNEL_PROGRAM, + reference_count = CL_KERNEL_REFERENCE_COUNT, + attributes = CL_KERNEL_ATTRIBUTES +}; + +enum class kernel_work_group : cl_kernel_work_group_info { + global_work_size = CL_KERNEL_GLOBAL_WORK_SIZE, + work_group_size = CL_KERNEL_WORK_GROUP_SIZE, + compile_work_group_size = CL_KERNEL_COMPILE_WORK_GROUP_SIZE, + preferred_work_group_size_multiple = + CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, + private_mem_size = CL_KERNEL_PRIVATE_MEM_SIZE +}; + +enum class kernel_sub_group : cl_kernel_sub_group_info { + max_sub_group_size_for_ndrange = CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE, + sub_group_count_for_ndrange = CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE, + local_size_for_sub_group_count = CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT, + max_num_sub_groups = CL_KERNEL_MAX_NUM_SUB_GROUPS, + compile_num_sub_groups = CL_KERNEL_COMPILE_NUM_SUB_GROUPS +}; + +// A.6 Program information desctiptors +enum class program : cl_program_info { + context = CL_PROGRAM_CONTEXT, + devices = CL_PROGRAM_DEVICES, + reference_count = CL_PROGRAM_REFERENCE_COUNT +}; + +// A.7 Event information desctiptors +enum class event : cl_event_info { + reference_count = CL_EVENT_REFERENCE_COUNT, + command_execution_status = 
CL_EVENT_COMMAND_EXECUTION_STATUS +}; + +enum class event_command_status : cl_int { + submitted = CL_SUBMITTED, + running = CL_RUNNING, + complete = CL_COMPLETE +}; + +enum class event_profiling : cl_profiling_info { + command_submit = CL_PROFILING_COMMAND_SUBMIT, + command_start = CL_PROFILING_COMMAND_START, + command_end = CL_PROFILING_COMMAND_END +}; + +// Provide an alias to the return type for each of the info parameters +template class param_traits {}; + +#define PARAM_TRAITS_SPEC(param_type, param, ret_type) \ + template <> class param_traits { \ + public: \ + using return_type = ret_type; \ + }; + +#define PARAM_TRAITS_SPEC_WITH_INPUT(param_type, param, ret_type, in_type) \ + template <> class param_traits { \ + public: \ + using return_type = ret_type; \ + using input_type = in_type; \ + }; + +PARAM_TRAITS_SPEC(device, device_type, device_type) +PARAM_TRAITS_SPEC(device, vendor_id, cl_uint) +PARAM_TRAITS_SPEC(device, max_compute_units, cl_uint) +PARAM_TRAITS_SPEC(device, max_work_item_dimensions, cl_uint) +PARAM_TRAITS_SPEC(device, max_work_item_sizes, id<3>) +PARAM_TRAITS_SPEC(device, max_work_group_size, size_t) +PARAM_TRAITS_SPEC(device, preferred_vector_width_char, cl_uint) +PARAM_TRAITS_SPEC(device, preferred_vector_width_short, cl_uint) +PARAM_TRAITS_SPEC(device, preferred_vector_width_int, cl_uint) +PARAM_TRAITS_SPEC(device, preferred_vector_width_long, cl_uint) +PARAM_TRAITS_SPEC(device, preferred_vector_width_float, cl_uint) +PARAM_TRAITS_SPEC(device, preferred_vector_width_double, cl_uint) +PARAM_TRAITS_SPEC(device, preferred_vector_width_half, cl_uint) +PARAM_TRAITS_SPEC(device, native_vector_width_char, cl_uint) +PARAM_TRAITS_SPEC(device, native_vector_width_short, cl_uint) +PARAM_TRAITS_SPEC(device, native_vector_width_int, cl_uint) +PARAM_TRAITS_SPEC(device, native_vector_width_long, cl_uint) +PARAM_TRAITS_SPEC(device, native_vector_width_float, cl_uint) +PARAM_TRAITS_SPEC(device, native_vector_width_double, cl_uint) 
+PARAM_TRAITS_SPEC(device, native_vector_width_half, cl_uint) +PARAM_TRAITS_SPEC(device, max_clock_frequency, cl_uint) +PARAM_TRAITS_SPEC(device, address_bits, cl_uint) +PARAM_TRAITS_SPEC(device, max_mem_alloc_size, cl_ulong) +PARAM_TRAITS_SPEC(device, image_support, bool) +PARAM_TRAITS_SPEC(device, max_read_image_args, cl_uint) +PARAM_TRAITS_SPEC(device, max_write_image_args, cl_uint) +PARAM_TRAITS_SPEC(device, image2d_max_width, size_t) +PARAM_TRAITS_SPEC(device, image2d_max_height, size_t) +PARAM_TRAITS_SPEC(device, image3d_max_width, size_t) +PARAM_TRAITS_SPEC(device, image3d_max_height, size_t) +PARAM_TRAITS_SPEC(device, image3d_max_depth, size_t) +PARAM_TRAITS_SPEC(device, image_max_buffer_size, size_t) +PARAM_TRAITS_SPEC(device, image_max_array_size, size_t) +PARAM_TRAITS_SPEC(device, max_samplers, cl_uint) +PARAM_TRAITS_SPEC(device, max_parameter_size, size_t) +PARAM_TRAITS_SPEC(device, mem_base_addr_align, cl_uint) +PARAM_TRAITS_SPEC(device, half_fp_config, vector_class) +PARAM_TRAITS_SPEC(device, single_fp_config, vector_class) +PARAM_TRAITS_SPEC(device, double_fp_config, vector_class) +PARAM_TRAITS_SPEC(device, global_mem_cache_type, info::global_mem_cache_type) +PARAM_TRAITS_SPEC(device, global_mem_cache_line_size, cl_uint) +PARAM_TRAITS_SPEC(device, global_mem_cache_size, cl_ulong) +PARAM_TRAITS_SPEC(device, global_mem_size, cl_ulong) +PARAM_TRAITS_SPEC(device, max_constant_buffer_size, cl_ulong) +PARAM_TRAITS_SPEC(device, max_constant_args, cl_uint) +PARAM_TRAITS_SPEC(device, local_mem_type, info::local_mem_type) +PARAM_TRAITS_SPEC(device, local_mem_size, cl_ulong) +PARAM_TRAITS_SPEC(device, error_correction_support, bool) +PARAM_TRAITS_SPEC(device, host_unified_memory, bool) +PARAM_TRAITS_SPEC(device, profiling_timer_resolution, size_t) +PARAM_TRAITS_SPEC(device, is_endian_little, bool) +PARAM_TRAITS_SPEC(device, is_available, bool) +PARAM_TRAITS_SPEC(device, is_compiler_available, bool) +PARAM_TRAITS_SPEC(device, is_linker_available, bool) 
+PARAM_TRAITS_SPEC(device, execution_capabilities, + vector_class) +PARAM_TRAITS_SPEC(device, queue_profiling, bool) +PARAM_TRAITS_SPEC(device, built_in_kernels, vector_class) +PARAM_TRAITS_SPEC(device, platform, cl::sycl::platform) +PARAM_TRAITS_SPEC(device, name, string_class) +PARAM_TRAITS_SPEC(device, vendor, string_class) +PARAM_TRAITS_SPEC(device, driver_version, string_class) +PARAM_TRAITS_SPEC(device, profile, string_class) +PARAM_TRAITS_SPEC(device, version, string_class) +PARAM_TRAITS_SPEC(device, opencl_c_version, string_class) +PARAM_TRAITS_SPEC(device, extensions, vector_class) +PARAM_TRAITS_SPEC(device, printf_buffer_size, size_t) +PARAM_TRAITS_SPEC(device, preferred_interop_user_sync, bool) +PARAM_TRAITS_SPEC(device, parent_device, cl::sycl::device) +PARAM_TRAITS_SPEC(device, partition_max_sub_devices, cl_uint) +PARAM_TRAITS_SPEC(device, partition_properties, + vector_class) +PARAM_TRAITS_SPEC(device, partition_affinity_domains, + vector_class) +PARAM_TRAITS_SPEC(device, partition_type_property, info::partition_property) +PARAM_TRAITS_SPEC(device, partition_type_affinity_domain, + info::partition_affinity_domain) +PARAM_TRAITS_SPEC(device, reference_count, cl_uint) +PARAM_TRAITS_SPEC(device, max_num_sub_groups, cl_uint) +PARAM_TRAITS_SPEC(device, sub_group_independent_forward_progress, bool) + +PARAM_TRAITS_SPEC(context, reference_count, cl_uint) +PARAM_TRAITS_SPEC(context, platform, cl::sycl::platform) +PARAM_TRAITS_SPEC(context, devices, vector_class) + +PARAM_TRAITS_SPEC(event, command_execution_status, event_command_status) +PARAM_TRAITS_SPEC(event, reference_count, cl_uint) + +PARAM_TRAITS_SPEC(event_profiling, command_submit, cl_ulong) +PARAM_TRAITS_SPEC(event_profiling, command_start, cl_ulong) +PARAM_TRAITS_SPEC(event_profiling, command_end, cl_ulong) + +PARAM_TRAITS_SPEC(kernel, function_name, string_class) +PARAM_TRAITS_SPEC(kernel, num_args, cl_uint) +PARAM_TRAITS_SPEC(kernel, reference_count, cl_uint) +PARAM_TRAITS_SPEC(kernel, 
attributes, string_class) +// Shilei: The following two traits are not covered in the current version of +// CTS (SYCL-1.2.1/master) +PARAM_TRAITS_SPEC(kernel, context, cl::sycl::context) +PARAM_TRAITS_SPEC(kernel, program, cl::sycl::program) + +PARAM_TRAITS_SPEC(kernel_work_group, compile_work_group_size, + cl::sycl::range<3>) +PARAM_TRAITS_SPEC(kernel_work_group, global_work_size, cl::sycl::range<3>) +PARAM_TRAITS_SPEC(kernel_work_group, preferred_work_group_size_multiple, size_t) +PARAM_TRAITS_SPEC(kernel_work_group, private_mem_size, cl_ulong) +PARAM_TRAITS_SPEC(kernel_work_group, work_group_size, size_t) + +PARAM_TRAITS_SPEC_WITH_INPUT(kernel_sub_group, max_sub_group_size_for_ndrange, + size_t, cl::sycl::range<3>) +PARAM_TRAITS_SPEC_WITH_INPUT(kernel_sub_group, sub_group_count_for_ndrange, + size_t, cl::sycl::range<3>) +PARAM_TRAITS_SPEC_WITH_INPUT(kernel_sub_group, local_size_for_sub_group_count, + cl::sycl::range<3>, size_t) +PARAM_TRAITS_SPEC(kernel_sub_group, max_num_sub_groups, size_t) +PARAM_TRAITS_SPEC(kernel_sub_group, compile_num_sub_groups, size_t) + +PARAM_TRAITS_SPEC(platform, profile, string_class) +PARAM_TRAITS_SPEC(platform, version, string_class) +PARAM_TRAITS_SPEC(platform, name, string_class) +PARAM_TRAITS_SPEC(platform, vendor, string_class) +PARAM_TRAITS_SPEC(platform, extensions, vector_class) + +PARAM_TRAITS_SPEC(program, context, cl::sycl::context) +PARAM_TRAITS_SPEC(program, devices, vector_class) +PARAM_TRAITS_SPEC(program, reference_count, cl_uint) + +PARAM_TRAITS_SPEC(queue, reference_count, cl_uint) +PARAM_TRAITS_SPEC(queue, context, cl::sycl::context) +PARAM_TRAITS_SPEC(queue, device, cl::sycl::device) + +#undef PARAM_TRAITS_SPEC + +} // namespace info +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/intel/sub_group.hpp b/sycl/include/CL/sycl/intel/sub_group.hpp new file mode 100644 index 000000000000..7897b0749f6a --- /dev/null +++ b/sycl/include/CL/sycl/intel/sub_group.hpp @@ -0,0 +1,428 @@ 
+//==----------- sub_group.hpp --- SYCL sub-group ---------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#ifdef __SYCL_DEVICE_ONLY__ +#define __NOEXCEPT noexcept +namespace cl { +namespace __spirv { +extern size_t BuiltInSubgroupLocalInvocationId() __NOEXCEPT; +extern size_t BuiltInSubgroupSize() __NOEXCEPT; +extern size_t BuiltInSubgroupMaxSize() __NOEXCEPT; +extern size_t BuiltInSubgroupId() __NOEXCEPT; +extern size_t BuiltInNumSubgroups() __NOEXCEPT; +extern size_t BuiltInNumEnqueuedSubgroups() __NOEXCEPT; +} // namespace __spirv +} // namespace cl + +// TODO: rework to use SPIRV +typedef uint uint2 __attribute__((ext_vector_type(2))); +typedef uint uint3 __attribute__((ext_vector_type(3))); +typedef uint uint4 __attribute__((ext_vector_type(4))); +typedef uint uint8 __attribute__((ext_vector_type(8))); +typedef ushort ushort2 __attribute__((ext_vector_type(2))); +typedef ushort ushort3 __attribute__((ext_vector_type(3))); +typedef ushort ushort4 __attribute__((ext_vector_type(4))); +typedef ushort ushort8 __attribute__((ext_vector_type(8))); +size_t get_sub_group_local_id(); // BuiltInSubgroupLocalInvocationId +size_t get_sub_group_size(); // BuiltInSubgroupSize +size_t get_max_sub_group_size(); // BuiltInSubgroupMaxSize +size_t get_sub_group_id(); // BuiltInSubgroupId +size_t get_num_sub_groups(); // BuiltInNumSubgroups +size_t get_enqueued_num_sub_groups(); // BuiltInNumEnqueuedSubgroups +int sub_group_any(int); +int sub_group_all(int); +int sub_group_broadcast(int x, uint sub_grou_local_id); +int sub_group_reduce_min(int x); +int sub_group_reduce_max(int x); +int sub_group_reduce_add(int x); +int sub_group_scan_exclusive_add(int x); +int sub_group_scan_exclusive_max(int x); +int 
sub_group_scan_exclusive_min(int x); +int sub_group_scan_inclusive_add(int x); +int sub_group_scan_inclusive_max(int x); +int sub_group_scan_inclusive_min(int x); +int intel_sub_group_shuffle(int data, uint c); +int intel_sub_group_shuffle_up(int prev, int cur, uint c); +int intel_sub_group_shuffle_down(int cur, int next, uint c); +int intel_sub_group_shuffle_xor(int data, uint c); +uint intel_sub_group_block_read(const __global uint *p); +uint2 intel_sub_group_block_read2(const __global uint *p); +uint4 intel_sub_group_block_read4(const __global uint *p); +uint8 intel_sub_group_block_read8(const __global uint *p); +void intel_sub_group_block_write(__global uint *p, uint data); +void intel_sub_group_block_write2(__global uint *p, uint2 data); +void intel_sub_group_block_write4(__global uint *p, uint4 data); +void intel_sub_group_block_write8(__global uint *p, uint8 data); + +ushort intel_sub_group_block_read_us(const __global ushort *p); +ushort2 intel_sub_group_block_read_us2(const __global ushort *p); +ushort4 intel_sub_group_block_read_us4(const __global ushort *p); +ushort8 intel_sub_group_block_read_us8(const __global ushort *p); +void intel_sub_group_block_write_us(__global ushort *p, ushort data); +void intel_sub_group_block_write_us2(__global ushort *p, ushort2 data); +void intel_sub_group_block_write_us4(__global ushort *p, ushort4 data); +void intel_sub_group_block_write_us8(__global ushort *p, ushort8 data); +void sub_group_barrier(cl::sycl::detail::cl_mem_fence_flags flags); + +namespace cl { +namespace sycl { +template class multi_ptr; +namespace intel { + +enum class Operation { exclusive_scan, inclusive_scan, reduce }; + +struct minimum { + Operation o; + minimum(Operation op) : o(op) {} + template T operator()(T x) { + switch (o) { + case Operation::exclusive_scan: { + return sub_group_scan_exclusive_min(x); + } + case Operation::inclusive_scan: { + return sub_group_scan_inclusive_min(x); + } + case Operation::reduce: { + return 
sub_group_reduce_min(x); + } + } + } +}; + +struct maximum { + Operation o; + maximum(Operation op) : o(op) {} + template T operator()(T x) { + switch (o) { + case Operation::exclusive_scan: { + return sub_group_scan_exclusive_max(x); + } + case Operation::inclusive_scan: { + return sub_group_scan_inclusive_max(x); + } + case Operation::reduce: { + return sub_group_reduce_max(x); + } + } + } +}; + +struct plus { + Operation o; + plus(Operation op) : o(op) {} + template T operator()(T x) { + switch (o) { + case Operation::exclusive_scan: { + return sub_group_scan_exclusive_add(x); + } + case Operation::inclusive_scan: { + return sub_group_scan_inclusive_add(x); + } + case Operation::reduce: { + return sub_group_reduce_add(x); + } + } + } +}; +struct sub_group { + /* --- common interface members --- */ + + id<1> get_local_id() const { + return get_sub_group_local_id(); //*cl::__spirv::BuiltInSubgroupLocalInvocationId(); + } + range<1> get_local_range() const { + return get_sub_group_size(); // cl::__spirv::BuiltInSubgroupSize(); + } + + range<1> get_max_local_range() const { + return get_max_sub_group_size(); // cl::__spirv::BuiltInSubgroupMaxSize(); + } + + id<1> get_group_id() const { + return get_sub_group_id(); // cl::__spirv::BuiltInSubgroupId(); + } + + size_t get_group_range() const { + return get_num_sub_groups(); // cl::__spirv::BuiltInNumSubgroups(); + } + + size_t get_uniform_group_range() const { + return get_enqueued_num_sub_groups(); // cl::__spirv::BuiltInNumEnqueuedSubgroups(); + } + + /* --- vote / ballot functions --- */ + + bool any(bool predicate) { return sub_group_any(predicate); } + + bool all(bool predicate) { return sub_group_all(predicate); } + + /* --- collectives --- */ + + template T broadcast(T x, id<1> local_id) { + return sub_group_broadcast(x, local_id.get(0)); + } + + template T reduce(T x) { + BinaryOperation o(Operation::reduce); + return o(x); + } + + template T exclusive_scan(T x) { + BinaryOperation o(Operation::exclusive_scan); 
+ return o(x); + } + + template T inclusive_scan(T x) { + BinaryOperation o(Operation::inclusive_scan); + return o(x); + } + + /* --- one - input shuffles --- */ + /* indices in [0 , sub - group size ) */ + + template T shuffle(T x, id<1> local_id) { + return intel_sub_group_shuffle(x, local_id.get(0)); + } + + template T shuffle_down(T x, uint32_t delta) { + return intel_sub_group_shuffle_down(x, x, delta); + } + + template T shuffle_up(T x, uint32_t delta) { + return intel_sub_group_shuffle_up(x, x, delta); + } + + template T shuffle_xor(T x, id<1> value) { + return intel_sub_group_shuffle_xor(x, value.get(0)); + } + + /* --- two - input shuffles --- */ + /* indices in [0 , 2* sub - group size ) */ + template T shuffle(T x, T y, id<1> local_id) { + return intel_sub_group_shuffle_down( + x, y, local_id.get(0) - get_local_id().get(0)); + } + + template T shuffle_down(T current, T next, uint32_t delta) { + return intel_sub_group_shuffle_down(current, next, delta); + } + template T shuffle_up(T previous, T current, uint32_t delta) { + return intel_sub_group_shuffle_up(previous, current, delta); + } + + /* --- sub - group load / stores --- */ + /* these can map to SIMD or block read / write hardware where available */ + + template + typename std::enable_if::type + load(const multi_ptr src) { + uint t = intel_sub_group_block_read((const __global uint *)src.get()); + return *((T *)&t); + } + + template + typename std::enable_if::type + load(const multi_ptr src) { + ushort t = + intel_sub_group_block_read_us((const __global ushort *)src.get()); + return *((T *)&t); + } + + template + typename std::enable_if::type + load(const multi_ptr src) { + uint t = intel_sub_group_block_read((const __global uint *)src.get()); + return *((T *)&t); + } + + template + typename std::enable_if::type + load(const multi_ptr src) { + uint t = intel_sub_group_block_read_us((const __global ushort *)src.get()); + return *((T *)&t); + } + + template + vec::type, N> + load(const multi_ptr src) { 
+ uint2 t = intel_sub_group_block_read2((const __global uint *)src.get()); + return *((typename vec::vector_t *)(&t)); + } + + template + vec::type, + N> + load(const multi_ptr src) { + ushort2 t = + intel_sub_group_block_read_us2((const __global ushort *)src.get()); + return *((typename vec::vector_t *)(&t)); + } + + template + vec::type, N> + load(const multi_ptr src) { + uint4 t = intel_sub_group_block_read4((const __global uint *)src.get()); + return *((typename vec::vector_t *)(&t)); + } + + template + vec::type, + N> + load(const multi_ptr src) { + ushort4 t = + intel_sub_group_block_read_us4((const __global ushort *)src.get()); + return *((typename vec::vector_t *)(&t)); + } + + template + vec::type, N> + load(const multi_ptr src) { + uint8 t = intel_sub_group_block_read8((const __global uint *)src.get()); + return *((typename vec::vector_t *)(&t)); + } + + template + vec::type, + N> + load(const multi_ptr src) { + ushort8 t = + intel_sub_group_block_read_us8((const __global ushort *)src.get()); + return *((typename vec::vector_t *)(&t)); + } + + template + void + store(multi_ptr dst, + const typename std::enable_if::type &x) { + intel_sub_group_block_write((__global uint *)dst.get(), *((uint *)&x)); + } + + template + void store( + multi_ptr dst, + const typename std::enable_if::type &x) { + intel_sub_group_block_write_us((__global ushort *)dst.get(), + *((ushort *)&x)); + } + + template + void store(multi_ptr dst, + const typename std::enable_if::type &x) { + intel_sub_group_block_write((__global uint *)dst.get(), *((uint *)&x)); + } + + template + void + store(multi_ptr dst, + const typename std::enable_if::type &x) { + intel_sub_group_block_write_us((__global ushort *)dst.get(), + *((ushort *)&x)); + } + + template + void store( + multi_ptr dst, + const vec< + typename std::enable_if::type, + N> &x) { + typename vec::vector_t t = x; + intel_sub_group_block_write2((__global uint *)dst.get(), *((uint2 *)&t)); + } + template + void + store(multi_ptr dst, + 
const vec::type, + N> &x) { + typename vec::vector_t t = x; + intel_sub_group_block_write_us2((__global ushort *)dst.get(), + *((ushort2 *)&t)); + } + + template + void store( + multi_ptr dst, + const vec< + typename std::enable_if::type, + N> &x) { + typename vec::vector_t t = x; + intel_sub_group_block_write4((__global uint *)dst.get(), *((uint4 *)&t)); + } + + template + void + store(multi_ptr dst, + const vec::type, + N> &x) { + typename vec::vector_t t = x; + intel_sub_group_block_write_us4((__global ushort *)dst.get(), + *((ushort4 *)&t)); + } + + template + void store( + multi_ptr dst, + const vec< + typename std::enable_if::type, + N> &x) { + typename vec::vector_t t = x; + intel_sub_group_block_write8((__global uint *)dst.get(), *((uint8 *)&t)); + } + + template + void + store(multi_ptr dst, + const vec::type, + N> &x) { + typename vec::vector_t t = x; + intel_sub_group_block_write_us8((__global ushort *)dst.get(), + *((ushort8 *)&t)); + } + + /* --- synchronization functions --- */ + void barrier(access::fence_space accessSpace = + access::fence_space::global_and_local) const { + cl::sycl::detail::cl_mem_fence_flags flags; + switch (accessSpace) { + case access::fence_space::local_space: + flags = cl::sycl::detail::CLK_LOCAL_MEM_FENCE; + break; + case access::fence_space::global_space: + flags = cl::sycl::detail::CLK_GLOBAL_MEM_FENCE; + break; + case access::fence_space::global_and_local: + default: + flags = cl::sycl::detail::CLK_LOCAL_MEM_FENCE | + cl::sycl::detail::CLK_GLOBAL_MEM_FENCE; + break; + } + ::sub_group_barrier(flags); + } + +protected: + template friend struct cl::sycl::nd_item; + sub_group() = default; +}; +} // namespace intel +} // namespace sycl +} // namespace cl +#else +#include +#endif diff --git a/sycl/include/CL/sycl/intel/sub_group_host.hpp b/sycl/include/CL/sycl/intel/sub_group_host.hpp new file mode 100644 index 000000000000..f9c43e1cbf79 --- /dev/null +++ b/sycl/include/CL/sycl/intel/sub_group_host.hpp @@ -0,0 +1,147 @@ +//==- 
sub_group_host.hpp --- SYCL sub-group for host device ---------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#ifndef __SYCL_DEVICE_ONLY__ + +namespace cl { +namespace sycl { +template class multi_ptr; +namespace intel { +struct minimum {}; +struct maximum {}; +struct plus {}; + +struct sub_group { + /* --- common interface members --- */ + + id<1> get_local_id() const { + throw runtime_error("Subgroups are not supported on host device. "); + } + range<1> get_local_range() const { + throw runtime_error("Subgroups are not supported on host device. "); + } + + range<1> get_max_local_range() const { + throw runtime_error("Subgroups are not supported on host device. "); + } + + id<1> get_group_id() const { + throw runtime_error("Subgroups are not supported on host device. "); + } + + size_t get_group_range() const { + throw runtime_error("Subgroups are not supported on host device. "); + } + + size_t get_uniform_group_range() const { + throw runtime_error("Subgroups are not supported on host device. "); + } + + /* --- vote / ballot functions --- */ + + bool any(bool predicate) { + throw runtime_error("Subgroups are not supported on host device. "); + } + + bool all(bool predicate) { + throw runtime_error("Subgroups are not supported on host device. "); + } + + /* --- collectives --- */ + + template T broadcast(T x, id<1> local_id) { + throw runtime_error("Subgroups are not supported on host device. "); + } + + template T reduce(T x) { + throw runtime_error("Subgroups are not supported on host device. "); + } + + template T exclusive_scan(T x) { + throw runtime_error("Subgroups are not supported on host device. 
"); + } + + template T inclusive_scan(T x) { + throw runtime_error("Subgroups are not supported on host device. "); + } + + /* --- one - input shuffles --- */ + /* indices in [0 , sub - group size ) */ + + template T shuffle(T x, id<1> local_id) { + throw runtime_error("Subgroups are not supported on host device. "); + } + + template T shuffle_down(T x, uint32_t delta) { + throw runtime_error("Subgroups are not supported on host device. "); + } + template T shuffle_up(T x, uint32_t delta) { + throw runtime_error("Subgroups are not supported on host device. "); + } + + template T shuffle_xor(T x, id<1> value) { + throw runtime_error("Subgroups are not supported on host device. "); + } + + /* --- two - input shuffles --- */ + /* indices in [0 , 2* sub - group size ) */ + template T shuffle(T x, T y, id<1> local_id) { + throw runtime_error("Subgroups are not supported on host device. "); + } + template T shuffle_down(T current, T next, uint32_t delta) { + throw runtime_error("Subgroups are not supported on host device. "); + } + template T shuffle_up(T previous, T current, uint32_t delta) { + throw runtime_error("Subgroups are not supported on host device. "); + } + + /* --- sub - group load / stores --- */ + /* these can map to SIMD or block read / write hardware where available */ + template + T load(const multi_ptr src) { + throw runtime_error("Subgroups are not supported on host device. "); + } + + template + vec load(const multi_ptr src) { + throw runtime_error("Subgroups are not supported on host device. "); + } + + template + void store(multi_ptr dst, T &x) { + throw runtime_error("Subgroups are not supported on host device. "); + } + + template + void store(multi_ptr dst, const vec &x) { + throw runtime_error("Subgroups are not supported on host device. 
"); + } + + /* --- synchronization functions --- */ + void barrier(access::fence_space accessSpace = + access::fence_space::global_and_local) const { + throw runtime_error("Subgroups are not supported on host device. "); + } + +protected: + template friend struct cl::sycl::nd_item; + sub_group() { + throw runtime_error("Subgroups are not supported on host device. "); + } +}; +} // namespace intel +} // namespace sycl +} // namespace cl +#endif diff --git a/sycl/include/CL/sycl/item.hpp b/sycl/include/CL/sycl/item.hpp new file mode 100644 index 000000000000..a6c860d73d5d --- /dev/null +++ b/sycl/include/CL/sycl/item.hpp @@ -0,0 +1,110 @@ +//==------------ item.hpp --- SYCL iteration item --------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +namespace cl { +namespace sycl { +namespace detail { +class Builder; +} +template struct id; +template struct range; +template struct item { + + item() = delete; + + id get_id() const { return index; } + + size_t get_id(int dimension) const { return index[dimension]; } + + size_t &operator[](int dimension) { return index[dimension]; } + + size_t operator[](int dimension) const { return index[dimension]; } + + range get_range() const { return extent; } + + size_t get_range(int dimension) const { return extent.get(dimension); } + + // only available if with_offset is true; + template ::type> + id get_offset() const { + return offset; + } + + template + operator typename std::enable_if>::type() + const { + return item(extent, index, offset); + } + + /* The following member function is only available in the id class + * specialization where: dimensions>0 and dimensions<4 */ + template 0) && (N < 4))>::type> + size_t get_linear_id() const { + if (1 == 
dimensions) { + return index[0] - offset[0]; + } + if (2 == dimensions) { + return (index[0] - offset[0]) * extent[1] + (index[1] - offset[1]); + } + return ((index[0] - offset[0]) * extent[1] * extent[2]) + + ((index[1] - offset[1]) * extent[2]) + (index[2] - offset[2]); + } + + item(const item &rhs) = + default; + + item(item &&rhs) = default; + + item & + operator=(const item &rhs) = default; + + item & + operator=(item &&rhs) = default; + + bool operator==(const item &rhs) const { + return (rhs.index == this->index) && (rhs.extent == this->extent) && + (rhs.offset == this->offset); + } + + bool operator!=(const item &rhs) const { + return !((*this) == rhs); + } + +protected: + // For call constructor inside conversion operator + friend class item; + friend class detail::Builder; + + template + item(typename std::enable_if<(W == true), const range>::type &R, + const id &I, const id &O) + : extent(R), index(I), offset(O) {} + + template + item(typename std::enable_if<(W == false), const range>::type &R, + const id &I) + : extent(R), index(I), offset() {} + +private: + range extent; + id index; + id offset; +}; + +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/kernel.hpp b/sycl/include/CL/sycl/kernel.hpp new file mode 100644 index 000000000000..ca3d7aae19c3 --- /dev/null +++ b/sycl/include/CL/sycl/kernel.hpp @@ -0,0 +1,94 @@ +//==--------------- kernel.hpp --- SYCL kernel -----------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +#include + +namespace cl { +namespace sycl { +// Forward declaration +class program; +class context; + +class kernel { + template + friend decltype(T::impl) detail::getSyclObjImpl(const T &SyclObject); + template + friend T detail::createSyclObjFromImpl(decltype(T::impl) ImplObj); + +public: + kernel(cl_kernel clKernel, const context &syclContext) + : impl(std::make_shared(clKernel, syclContext)) {} + + kernel(const kernel &rhs) = default; + + kernel(kernel &&rhs) = default; + + kernel &operator=(const kernel &rhs) = default; + + kernel &operator=(kernel &&rhs) = default; + + bool operator==(const kernel &rhs) const { return impl == rhs.impl; } + + bool operator!=(const kernel &rhs) const { return !operator==(rhs); } + + cl_kernel get() const { return impl->get(); } + + bool is_host() const { return impl->is_host(); } + + context get_context() const { return impl->get_context(); } + + program get_program() const; + + template + typename info::param_traits::return_type + get_info() const { + return impl->get_info(); + } + + template + typename info::param_traits::return_type + get_work_group_info(const device &dev) const { + return impl->get_work_group_info(dev); + } + + template + typename info::param_traits::return_type + get_sub_group_info(const device &dev) const { + return impl->get_sub_group_info(dev); + } + + template + typename info::param_traits::return_type + get_sub_group_info(const device &dev, + typename info::param_traits::input_type val) const { + return impl->get_sub_group_info(dev, val); + } + +private: + kernel(std::shared_ptr impl) : impl(impl) {} + + std::shared_ptr impl; +}; +} // namespace sycl +} // namespace cl + +namespace std { +template <> struct hash { + size_t operator()(const cl::sycl::kernel &k) const { + return hash>()( + cl::sycl::detail::getSyclObjImpl(k)); + } +}; +} // namespace std diff --git 
a/sycl/include/CL/sycl/macro.hpp b/sycl/include/CL/sycl/macro.hpp new file mode 100644 index 000000000000..526bf234c9a0 --- /dev/null +++ b/sycl/include/CL/sycl/macro.hpp @@ -0,0 +1,11 @@ +//==-------------- macro.hpp - SYCL macro header ---------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once +#define CL_SYCL_LANGUAGE_VERSION 121 diff --git a/sycl/include/CL/sycl/math.hpp b/sycl/include/CL/sycl/math.hpp new file mode 100644 index 000000000000..0b88b3de55ff --- /dev/null +++ b/sycl/include/CL/sycl/math.hpp @@ -0,0 +1,307 @@ +//==----------- math.hpp - SYCL math functions ------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#include + +#ifdef __SYCL_DEVICE_ONLY__ + +#define CONCAT_HELP(a, b) a##b +#define CONCAT(a, b) CONCAT_HELP(a, b) + +#define SCALAR(type) CONCAT(CONCAT(__, type), _t) +#define VECTOR(type, len) CONCAT(CONCAT(CONCAT(__, type), len), _vec_t) + +#define MAKE_FUN_OF_1_ARG(name, ret_ty, arg_1_ty) ret_ty name(arg_1_ty); + +#define MAKE_FUN_OF_2_ARG(name, ret_ty, arg_1_ty, arg_2_ty) \ + ret_ty name(arg_1_ty, arg_2_ty); + +#define MAKE_FUN_OF_3_ARG(name, ret_ty, arg_1_ty, arg_2_ty, arg_3_ty) \ + ret_ty name(arg_1_ty, arg_2_ty, arg_3_ty); + +#define GEN_FUNC_OF_ONE_ARG_V(name, ret_ty, arg_1_ty) \ + MAKE_FUN_OF_1_ARG(name, VECTOR(ret_ty, 2), VECTOR(arg_1_ty, 2)) \ + MAKE_FUN_OF_1_ARG(name, VECTOR(ret_ty, 3), VECTOR(arg_1_ty, 3)) \ + MAKE_FUN_OF_1_ARG(name, VECTOR(ret_ty, 4), VECTOR(arg_1_ty, 4)) \ + MAKE_FUN_OF_1_ARG(name, VECTOR(ret_ty, 8), VECTOR(arg_1_ty, 8)) \ + MAKE_FUN_OF_1_ARG(name, VECTOR(ret_ty, 16), VECTOR(arg_1_ty, 16)) + +#define GEN_FUNC_OF_TWO_ARG_V(name, ret_ty, arg_1_ty, arg_2_ty) \ + MAKE_FUN_OF_2_ARG(name, VECTOR(ret_ty, 2), VECTOR(arg_1_ty, 2), \ + VECTOR(arg_2_ty, 2)) \ + MAKE_FUN_OF_2_ARG(name, VECTOR(ret_ty, 3), VECTOR(arg_1_ty, 3), \ + VECTOR(arg_2_ty, 3)) \ + MAKE_FUN_OF_2_ARG(name, VECTOR(ret_ty, 4), VECTOR(arg_1_ty, 4), \ + VECTOR(arg_2_ty, 4)) \ + MAKE_FUN_OF_2_ARG(name, VECTOR(ret_ty, 8), VECTOR(arg_1_ty, 8), \ + VECTOR(arg_2_ty, 8)) \ + MAKE_FUN_OF_2_ARG(name, VECTOR(ret_ty, 16), VECTOR(arg_1_ty, 16), \ + VECTOR(arg_2_ty, 16)) + +#define GEN_FUNC_OF_THREE_ARG_V(name, ret_ty, arg_1_ty, arg_2_ty, arg_3_ty) \ + MAKE_FUN_OF_3_ARG(name, VECTOR(ret_ty, 2), VECTOR(arg_1_ty, 2), \ + VECTOR(arg_2_ty, 2), VECTOR(arg_3_ty, 2)) \ + MAKE_FUN_OF_3_ARG(name, VECTOR(ret_ty, 3), VECTOR(arg_1_ty, 3), \ + VECTOR(arg_2_ty, 3), VECTOR(arg_3_ty, 3)) \ + MAKE_FUN_OF_3_ARG(name, VECTOR(ret_ty, 4), VECTOR(arg_1_ty, 4), \ + VECTOR(arg_2_ty, 4), 
VECTOR(arg_3_ty, 4)) \ + MAKE_FUN_OF_3_ARG(name, VECTOR(ret_ty, 8), VECTOR(arg_1_ty, 8), \ + VECTOR(arg_2_ty, 8), VECTOR(arg_3_ty, 8)) \ + MAKE_FUN_OF_3_ARG(name, VECTOR(ret_ty, 16), VECTOR(arg_1_ty, 16), \ + VECTOR(arg_2_ty, 16), VECTOR(arg_3_ty, 16)) + +#define GEN_FUNC_OF_ONE_ARG_S(name, ret_ty, arg_1_ty) \ + MAKE_FUN_OF_1_ARG(name, SCALAR(ret_ty), SCALAR(arg_1_ty)) + +#define GEN_FUNC_OF_TWO_ARG_S(name, ret_ty, arg_1_ty, arg_2_ty) \ + MAKE_FUN_OF_2_ARG(name, SCALAR(ret_ty), SCALAR(arg_1_ty), SCALAR(arg_2_ty)) + +#define GEN_FUNC_OF_THREE_ARG_S(name, ret_ty, arg_1_ty, arg_2_ty, arg_3_ty) \ + MAKE_FUN_OF_3_ARG(name, SCALAR(ret_ty), SCALAR(arg_1_ty), SCALAR(arg_2_ty), \ + SCALAR(arg_3_ty)) + +#define GEN_FUNC_OF_ONE_ARG(name, ret_ty, arg_1_ty) \ + GEN_FUNC_OF_ONE_ARG_S(name, ret_ty, arg_1_ty) \ + GEN_FUNC_OF_ONE_ARG_V(name, ret_ty, arg_1_ty) + +#define GEN_FUNC_OF_TWO_ARG(name, ret_ty, arg_1_ty, arg_2_ty) \ + GEN_FUNC_OF_TWO_ARG_S(name, ret_ty, arg_1_ty, arg_2_ty) \ + GEN_FUNC_OF_TWO_ARG_V(name, ret_ty, arg_1_ty, arg_2_ty) + +#define GEN_FUNC_OF_THREE_ARG(name, ret_ty, arg_1_ty, arg_2_ty, arg_3_ty) \ + GEN_FUNC_OF_THREE_ARG_S(name, ret_ty, arg_1_ty, arg_2_ty, arg_3_ty) \ + GEN_FUNC_OF_THREE_ARG_V(name, ret_ty, arg_1_ty, arg_2_ty, arg_3_ty) + +#define GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(name, ret_ty, arg_1_ty, arg_2_ty) \ + MAKE_FUN_OF_2_ARG(name, VECTOR(ret_ty, 2), VECTOR(arg_1_ty, 2), \ + SCALAR(arg_2_ty)) \ + MAKE_FUN_OF_2_ARG(name, VECTOR(ret_ty, 3), VECTOR(arg_1_ty, 3), \ + SCALAR(arg_2_ty)) \ + MAKE_FUN_OF_2_ARG(name, VECTOR(ret_ty, 4), VECTOR(arg_1_ty, 4), \ + SCALAR(arg_2_ty)) \ + MAKE_FUN_OF_2_ARG(name, VECTOR(ret_ty, 8), VECTOR(arg_1_ty, 8), \ + SCALAR(arg_2_ty)) \ + MAKE_FUN_OF_2_ARG(name, VECTOR(ret_ty, 16), VECTOR(arg_1_ty, 16), \ + SCALAR(arg_2_ty)) + +#define GEN_FUNC_OF_TWO_ARG_S_SECOND_ARG_S(name, ret_ty, arg_1_ty, arg_2_ty) \ + MAKE_FUN_OF_2_ARG(name, SCALAR(ret_ty), SCALAR(arg_1_ty), SCALAR(arg_2_ty)) + +#define 
GEN_FUNC_OF_TWO_ARG_SECOND_ARG_S(name, ret_ty, arg_1_ty, arg_2_ty) \ + GEN_FUNC_OF_TWO_ARG_S_SECOND_ARG_S(name, ret_ty, arg_1_ty) \ + GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(name, ret_ty, arg_1_ty) + +#define GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(name, ret_ty, arg_1_ty) \ + GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(name, ret_ty, arg_1_ty, char) \ + GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(name, ret_ty, arg_1_ty, uchar) \ + GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(name, ret_ty, arg_1_ty, short) \ + GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(name, ret_ty, arg_1_ty, ushort) \ + GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(name, ret_ty, arg_1_ty, int) \ + GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(name, ret_ty, arg_1_ty, uint) \ + GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(name, ret_ty, arg_1_ty, long) \ + GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(name, ret_ty, arg_1_ty, ulong) + +namespace cl { +namespace __spirv { +/* ----------------- 4.13.3 Math functions. Device version ------------------*/ +// TODO: Enable built-in functions with 'half' parameters once 'half' data type +/// is supported by the clang +// genfloat exp (genfloat x ) +GEN_FUNC_OF_ONE_ARG(exp, float, float) +GEN_FUNC_OF_ONE_ARG(exp, double, double) +// GEN_FUNC_OF_ONE_ARG(exp, half, half) + +// genfloat fmax (genfloat x, genfloat y) +GEN_FUNC_OF_TWO_ARG(fmax, float, float, float) +GEN_FUNC_OF_TWO_ARG(fmax, double, double, double) +// GEN_FUNC_OF_TWO_ARG(fmax, half, half, half) + +// genfloat fmax (genfloat x, sgenfloat y) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmax, float, float, float) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmax, double, double, float) +// GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmax, half, half, float) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmax, float, float, double) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmax, double, double, double) +// GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmax, half, half, double) +// GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmax, float, float, half) +// GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmax, double, double, half) +// 
GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmax, half, half, half) + +// genfloat fmin (genfloat x, genfloat y) +GEN_FUNC_OF_TWO_ARG(fmin, float, float, float) +GEN_FUNC_OF_TWO_ARG(fmin, double, double, double) +// GEN_FUNC_OF_TWO_ARG(fmin, half, half, half) + +// genfloat fmin (genfloat x, sgenfloat y) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmin, float, float, float) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmin, double, double, float) +// GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmin, half, half, float) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmin, float, float, double) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmin, double, double, double) +// GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmin, half, half, double) +// GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmin, float, float, half) +// GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmin, double, double, half) +// GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S(fmin, half, half, half) + +// genfloat sqrt (genfloat x) +GEN_FUNC_OF_ONE_ARG(sqrt, float, float) +GEN_FUNC_OF_ONE_ARG(sqrt, double, double) +// GEN_FUNC_OF_ONE_ARG(sqrt, half, half) + +// genfloatf log (genfloatf x) +GEN_FUNC_OF_ONE_ARG(log, float, float) + +// genfloatf sin (genfloatf x) +GEN_FUNC_OF_ONE_ARG(sin, float, float) + +// genfloatf cos (genfloatf x) +GEN_FUNC_OF_ONE_ARG(cos, float, float) + +// genfloat mad (genfloat a, genfloat b, genfloat c) +GEN_FUNC_OF_THREE_ARG(mad, float, float, float, float) +GEN_FUNC_OF_THREE_ARG(mad, double, double, double, double) +// GEN_FUNC_OF_THREE_ARG_V(mad, half, half, half, half) + +// genfloatf exp (genfloatf x) +GEN_FUNC_OF_ONE_ARG(native_exp, float, float) + +// genfloatf fabs (genfloatf x) +GEN_FUNC_OF_ONE_ARG(fabs, float, float) +GEN_FUNC_OF_ONE_ARG(fabs, double, double) +// GEN_FUNC_OF_ONE_ARG(fabs, half, half) + +/* --------------- 4.13.4 Integer functions. 
Device version -----------------*/ +// geninteger max (geninteger x, geninteger y) +GEN_FUNC_OF_TWO_ARG(max, char, char, char) +GEN_FUNC_OF_TWO_ARG(max, uchar, uchar, uchar) +GEN_FUNC_OF_TWO_ARG(max, short, short, short) +GEN_FUNC_OF_TWO_ARG(max, ushort, ushort, ushort) +GEN_FUNC_OF_TWO_ARG(max, int, int, int) +GEN_FUNC_OF_TWO_ARG(max, uint, uint, uint) +GEN_FUNC_OF_TWO_ARG(max, long, long, long) +GEN_FUNC_OF_TWO_ARG(max, ulong, ulong, ulong) + +// geninteger max (geninteger x, sgeninteger y) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(max, char, char) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(max, uchar, uchar) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(max, short, short) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(max, ushort, ushort) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(max, int, int) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(max, uint, uint) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(max, long, long) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(max, ulong, ulong) + +// geninteger min (geninteger x, geninteger y) +GEN_FUNC_OF_TWO_ARG(min, char, char, char) +GEN_FUNC_OF_TWO_ARG(min, uchar, uchar, uchar) +GEN_FUNC_OF_TWO_ARG(min, short, short, short) +GEN_FUNC_OF_TWO_ARG(min, ushort, ushort, ushort) +GEN_FUNC_OF_TWO_ARG(min, int, int, int) +GEN_FUNC_OF_TWO_ARG(min, uint, uint, uint) +GEN_FUNC_OF_TWO_ARG(min, long, long, long) +GEN_FUNC_OF_TWO_ARG(min, ulong, ulong, ulong) + +// geninteger min (geninteger x, sgeninteger y) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(min, char, char) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(min, uchar, uchar) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(min, short, short) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(min, ushort, ushort) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(min, int, int) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(min, uint, uint) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(min, long, long) +GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER(min, ulong, 
ulong) +/* --------------- 4.13.5 Common functions. Device version ------------------*/ +/* --------------- 4.13.6 Geometric Functions. Device version ---------------*/ +/* --------------- 4.13.7 Relational functions. Device version --------------*/ +} // namespace __spirv +} // namespace cl + +#undef CONCAT_HELP +#undef CONCAT +#undef SCALAR +#undef VECTOR +#undef MAKE_FUN_OF_1_ARG +#undef MAKE_FUN_OF_2_ARG +#undef MAKE_FUN_OF_3_ARG +#undef GEN_FUNC_OF_ONE_ARG_V +#undef GEN_FUNC_OF_TWO_ARG_V +#undef GEN_FUNC_OF_THREE_ARG_V +#undef GEN_FUNC_OF_ONE_ARG_S +#undef GEN_FUNC_OF_TWO_ARG_S +#undef GEN_FUNC_OF_THREE_ARG_S +#undef GEN_FUNC_OF_ONE_ARG +#undef GEN_FUNC_OF_TWO_ARG +#undef GEN_FUNC_OF_THREE_ARG +#undef GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S +#undef GEN_FUNC_OF_TWO_ARG_S_SECOND_ARG_S +#undef GEN_FUNC_OF_TWO_ARG_SECOND_ARG_S +#undef GEN_FUNC_OF_TWO_ARG_V_SECOND_ARG_S_GENINTEGER +#endif // __SYCL_DEVICE_ONLY__ + +#ifdef __SYCL_DEVICE_ONLY__ +namespace __sycl_std = cl::__spirv; +#else +namespace __sycl_std = std; +#endif + +namespace cl { +namespace sycl { +template T cos(T x) { + return __sycl_std::cos(x); +} +template T exp(T x) { + return __sycl_std::exp(x); +} +template T1 fmax(T1 x, T2 y) { + return __sycl_std::fmax(x, y); +} +template T1 fmin(T1 x, T2 y) { + return __sycl_std::fmin(x, y); +} +template T log(T x) { + return __sycl_std::log(x); +} +template T mad(T a, T b, T c) { +#ifdef __SYCL_DEVICE_ONLY__ + return __sycl_std::mad(a, b, c); +#else + return (a * b) + c; +#endif +} +template T1 max(T1 x, T2 y) { + return __sycl_std::max(x, y); +} +template T1 min(T1 x, T2 y) { + return __sycl_std::min(x, y); +} +template T sin(T x) { + return __sycl_std::sin(x); +} +template T sqrt(T x) { + return __sycl_std::sqrt(x); +} +template T fabs(T x) { + return __sycl_std::fabs(x); +} +namespace native { +template T exp(T x) { +#ifdef __SYCL_DEVICE_ONLY__ + return __sycl_std::native_exp(x); +#else + return __sycl_std::exp(x); +#endif +} +} // namespace native +namespace 
half_precision {} // namespace half_precision +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/multi_ptr.hpp b/sycl/include/CL/sycl/multi_ptr.hpp new file mode 100644 index 000000000000..357aa1cd26f5 --- /dev/null +++ b/sycl/include/CL/sycl/multi_ptr.hpp @@ -0,0 +1,335 @@ +//==------------ multi_ptr.hpp - SYCL multi_ptr class ----------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +namespace cl { +namespace sycl { +template class multi_ptr { +public: + using element_type = ElementType; + using difference_type = std::ptrdiff_t; + + // Implementation defined pointer and reference types that correspond to + // SYCL/OpenCL interoperability types for OpenCL C functions + using pointer_t = typename detail::PtrValueType::type *; + using const_pointer_t = + typename detail::PtrValueType::type const *; + using reference_t = typename detail::PtrValueType::type &; + using const_reference_t = + typename detail::PtrValueType::type &; + + static constexpr access::address_space address_space = Space; + + // Constructors + multi_ptr() : m_Pointer(nullptr) {} + multi_ptr(const multi_ptr &rhs) = default; + multi_ptr(multi_ptr &&) = default; + multi_ptr(pointer_t pointer) : m_Pointer(pointer) {} +#ifdef __SYCL_DEVICE_ONLY__ + multi_ptr(ElementType *pointer) + : m_Pointer(reinterpret_cast(pointer)) { + // TODO An implementation should reject an argument if the deduced + // address space is not compatible with Space. 
+ } +#endif + multi_ptr(std::nullptr_t) : m_Pointer(nullptr) {} + ~multi_ptr() = default; + + // Assignment and access operators + multi_ptr &operator=(const multi_ptr &) = default; + multi_ptr &operator=(multi_ptr &&) = default; + multi_ptr &operator=(pointer_t pointer) { + m_Pointer = pointer; + return *this; + } +#ifdef __SYCL_DEVICE_ONLY__ + multi_ptr &operator=(ElementType *pointer) { + m_Pointer = reinterpret_cast(pointer); + // TODO An implementation should reject an argument if the deduced + // address space is not compatible with Space. + } +#endif + multi_ptr &operator=(std::nullptr_t) { + m_Pointer = nullptr; + return *this; + } + ElementType &operator*() const { + return *(reinterpret_cast(m_Pointer)); + } + ElementType *operator->() const { + return reinterpret_cast(m_Pointer); + } + ElementType &operator[](difference_type index) { + return *(reinterpret_cast(m_Pointer + index)); + } + ElementType operator[](difference_type index) const { + return *(reinterpret_cast(m_Pointer + index)); + } + + // Only if Space == global_space + template ::type> + multi_ptr(accessor + Accessor) + : multi_ptr(Accessor.get_pointer()) {} + + // Only if Space == local_space + template < + int dimensions, access::mode Mode, access::placeholder isPlaceholder, + access::address_space _Space = Space, + typename = typename std::enable_if< + _Space == Space && Space == access::address_space::local_space>::type> + multi_ptr(accessor + Accessor) + : multi_ptr(Accessor.get_pointer()) {} + + // Only if Space == constant_space + template ::type> + multi_ptr(accessor + Accessor) + : multi_ptr(Accessor.get_pointer()) {} + + // Returns the underlying OpenCL C pointer + pointer_t get() const { return m_Pointer; } + + // Implicit conversion to the underlying pointer type + operator ElementType *() const { + return reinterpret_cast(m_Pointer); + } + + // Explicit conversion to a multi_ptr + explicit operator multi_ptr() const; + + // Arithmetic operators + multi_ptr &operator++() { + 
m_Pointer += (difference_type)1; + return *this; + } + multi_ptr operator++(int) { + multi_ptr result(*this); + ++(*this); + return result; + } + multi_ptr &operator--() { + m_Pointer -= (difference_type)1; + return *this; + } + multi_ptr operator--(int) { + multi_ptr result(*this); + --(*this); + return result; + } + multi_ptr &operator+=(difference_type r) { + m_Pointer += r; + return *this; + } + multi_ptr &operator-=(difference_type r) { + m_Pointer -= r; + return *this; + } + multi_ptr operator+(difference_type r) const { + return multi_ptr(m_Pointer + r); + } + multi_ptr operator-(difference_type r) const { + return multi_ptr(m_Pointer - r); + } + + void prefetch(size_t numElements) const; + +private: + pointer_t m_Pointer; +}; + +// Specialization of multi_ptr for void +template class multi_ptr { +public: + using element_type = void; + using difference_type = std::ptrdiff_t; + + // Implementation defined pointer types that correspond to + // SYCL/OpenCL interoperability types for OpenCL C functions + using pointer_t = typename detail::PtrValueType::type *; + using const_pointer_t = + typename detail::PtrValueType::type const *; + + static constexpr access::address_space address_space = Space; + + // Constructors + multi_ptr() : m_Pointer(nullptr) {} + multi_ptr(const multi_ptr &) = default; + multi_ptr(multi_ptr &&) = default; + multi_ptr(pointer_t pointer) : m_Pointer(pointer) {} +#ifdef __SYCL_DEVICE_ONLY__ + multi_ptr(void *pointer) : m_Pointer(reinterpret_cast(pointer)) { + // TODO An implementation should reject an argument if the deduced + // address space is not compatible with Space. 
+ } +#endif + multi_ptr(std::nullptr_t) : m_Pointer(nullptr) {} + ~multi_ptr() = default; + + // Assignment operators + multi_ptr &operator=(const multi_ptr &) = default; + multi_ptr &operator=(multi_ptr &&) = default; + multi_ptr &operator=(pointer_t pointer) { + m_Pointer = pointer; + return *this; + } +#ifdef __SYCL_DEVICE_ONLY__ + multi_ptr &operator=(void *pointer) { + m_Pointer = reinterpret_cast(pointer); + // TODO An implementation should reject an argument if the deduced + // address space is not compatible with Space. + } +#endif + multi_ptr &operator=(std::nullptr_t) { + m_Pointer = nullptr; + return *this; + } + + // Only if Space == global_space + template ::type> + multi_ptr( + accessor + Accessor) + : multi_ptr(Accessor.get_pointer()) {} + + // Only if Space == local_space + template < + typename ElementType, int dimensions, access::mode Mode, + access::address_space _Space = Space, + typename = typename std::enable_if< + _Space == Space && Space == access::address_space::local_space>::type> + multi_ptr( + accessor Accessor) + : multi_ptr(Accessor.get_pointer()) {} + + // Only if Space == constant_space + template ::type> + multi_ptr( + accessor + Accessor) + : multi_ptr(Accessor.get_pointer()) {} + + // Returns the underlying OpenCL C pointer + pointer_t get() const { return m_Pointer; } + + // Implicit conversion to the underlying pointer type + operator void *() const; + + // Explicit conversion to a multi_ptr + template + explicit operator multi_ptr() const; + +private: + pointer_t m_Pointer; +}; + +template +multi_ptr +make_ptr(typename multi_ptr::pointer_t pointer) { + return multi_ptr(pointer); +} + +#ifdef __SYCL_DEVICE_ONLY__ +// An implementation should reject an argument if the deduced address space +// is not compatible with Space. +// This is guaranteed by the c'tor. 
+template +multi_ptr make_ptr(ElementType *pointer) { + return multi_ptr(pointer); +} +#endif + +template +bool operator==(const multi_ptr &lhs, + const multi_ptr &rhs); + +template +bool operator!=(const multi_ptr &lhs, + const multi_ptr &rhs); + +template +bool operator<(const multi_ptr &lhs, + const multi_ptr &rhs); + +template +bool operator>(const multi_ptr &lhs, + const multi_ptr &rhs); + +template +bool operator<=(const multi_ptr &lhs, + const multi_ptr &rhs); + +template +bool operator>=(const multi_ptr &lhs, + const multi_ptr &rhs); + +template +bool operator!=(const multi_ptr &lhs, std::nullptr_t rhs); + +template +bool operator!=(std::nullptr_t lhs, const multi_ptr &rhs); + +template +bool operator==(const multi_ptr &lhs, std::nullptr_t rhs); + +template +bool operator==(std::nullptr_t lhs, const multi_ptr &rhs); + +template +bool operator>(const multi_ptr &lhs, std::nullptr_t rhs); + +template +bool operator>(std::nullptr_t lhs, const multi_ptr &rhs); + +template +bool operator<(const multi_ptr &lhs, std::nullptr_t rhs); + +template +bool operator<(std::nullptr_t lhs, const multi_ptr &rhs); + +template +bool operator>=(const multi_ptr &lhs, std::nullptr_t rhs); + +template +bool operator>=(std::nullptr_t lhs, const multi_ptr &rhs); + +template +bool operator<=(const multi_ptr &lhs, std::nullptr_t rhs); + +template +bool operator<=(std::nullptr_t lhs, const multi_ptr &rhs); +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/nd_item.hpp b/sycl/include/CL/sycl/nd_item.hpp new file mode 100644 index 000000000000..5d34652dbcae --- /dev/null +++ b/sycl/include/CL/sycl/nd_item.hpp @@ -0,0 +1,180 @@ +//==--------- nd_item.hpp --- SYCL iteration nd_item -----------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cl { +namespace sycl { +namespace detail { +class Builder; +} +template struct nd_item { + + nd_item() = delete; + + id get_global_id() const { return globalItem.get_id(); } + + size_t get_global_id(int dimension) const { + return globalItem.get_id(dimension); + } + + size_t get_global_linear_id() const { return globalItem.get_linear_id(); } + + id get_local_id() const { return localItem.get_id(); } + + size_t get_local_id(int dimension) const { + return localItem.get_id(dimension); + } + + size_t get_local_linear_id() const { return localItem.get_linear_id(); } + + group get_group() const { return Group; } + + intel::sub_group get_sub_group() const { return intel::sub_group(); } + + size_t get_group(int dimension) const { return Group[dimension]; } + + size_t get_group_linear_id() const { return Group.get_linear(); } + + range get_group_range() const { + return Group.get_global_range() / Group.get_local_range(); + } + + size_t get_group_range(int dimension) const { + return Group.get_global_range(dimension) / Group.get_local_range(dimension); + } + + range get_global_range() const { return globalItem.get_range(); } + + size_t get_global_range(int dimension) const { + return globalItem.get_range(dimension); + } + + range get_local_range() const { return localItem.get_range(); } + + size_t get_local_range(int dimension) const { + return localItem.get_range(dimension); + } + + id get_offset() const { return globalItem.get_offset(); } + + nd_range get_nd_range() const { + return nd_range(get_global_range(), get_local_range(), + get_offset()); + } + + void barrier(access::fence_space accessSpace = + access::fence_space::global_and_local) const { + uint32_t flags = ::cl::__spirv::MemorySemantics::SequentiallyConsistent; + switch (accessSpace) { + case 
access::fence_space::global_space: + flags |= cl::__spirv::MemorySemantics::CrossWorkgroupMemory; + break; + case access::fence_space::local_space: + flags |= cl::__spirv::MemorySemantics::WorkgroupMemory; + break; + case access::fence_space::global_and_local: + default: + flags |= cl::__spirv::MemorySemantics::CrossWorkgroupMemory | + cl::__spirv::MemorySemantics::WorkgroupMemory; + break; + } + cl::__spirv::OpControlBarrier(::cl::__spirv::Scope::Workgroup, + ::cl::__spirv::Scope::Workgroup, flags); + } + + /// Executes a work-group mem-fence with memory ordering on the local address + /// space, global address space or both based on the value of \p accessSpace. + template + void + mem_fence(typename std::enable_if::type accessSpace = + access::fence_space::global_and_local) const { + Group.mem_fence(); + } + + template + device_event async_work_group_copy(local_ptr dest, + global_ptr src, + size_t numElements) const { + return Group.async_work_group_copy(dest, src, numElements); + } + + template + device_event async_work_group_copy(global_ptr dest, + local_ptr src, + size_t numElements) const { + return Group.async_work_group_copy(dest, src, numElements); + } + + template + device_event async_work_group_copy(local_ptr dest, + global_ptr src, + size_t numElements, + size_t srcStride) const { + + return Group.async_work_group_copy(dest, src, numElements, srcStride); + } + + template + device_event async_work_group_copy(global_ptr dest, + local_ptr src, + size_t numElements, + size_t destStride) const { + return Group.async_work_group_copy(dest, src, numElements, destStride); + } + + template + void wait_for(eventTN... 
events) const { + Group.wait_for(events...); + } + + nd_item(const nd_item &rhs) = default; + + nd_item(nd_item &&rhs) = default; + + nd_item &operator=(const nd_item &rhs) = default; + + nd_item &operator=(nd_item &&rhs) = default; + + bool operator==(const nd_item &rhs) const { + return (rhs.localItem == this->localItem) && + (rhs.globalItem == this->globalItem) && (rhs.Group == this->Group); + } + + bool operator!=(const nd_item &rhs) const { + return !((*this) == rhs); + } + +protected: + friend class detail::Builder; + nd_item(const item &GL, const item &L, + const group &GR) + : globalItem(GL), localItem(L), Group(GR) {} + +private: + item localItem; + item globalItem; + group Group; +}; +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/nd_range.hpp b/sycl/include/CL/sycl/nd_range.hpp new file mode 100644 index 000000000000..6520309792b8 --- /dev/null +++ b/sycl/include/CL/sycl/nd_range.hpp @@ -0,0 +1,59 @@ +//==-------- nd_range.hpp --- SYCL iteration nd_range ----------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +namespace cl { +namespace sycl { + +template class nd_range { + range globalSize; + range localSize; + id offset; + +public: + template + nd_range( + typename std::enable_if<((N > 0) && (N < 4)), range>::type globalSize, + range localSize, id offset = id()) + : globalSize(globalSize), localSize(localSize), offset(offset) {} + + range get_global_range() const { return globalSize; } + + range get_local_range() const { return localSize; } + + range get_group_range() const { return globalSize / localSize; } + + id get_offset() const { return offset; } + + // Common special member functions for by-value semantics + nd_range(const nd_range &rhs) = default; + nd_range(nd_range &&rhs) = default; + nd_range &operator=(const nd_range &rhs) = default; + nd_range &operator=(nd_range &&rhs) = default; + nd_range() = default; + + // Common member functions for by-value semantics + bool operator==(const nd_range &rhs) const { + return (rhs.globalSize == this->globalSize) && + (rhs.localSize == this->localSize) && (rhs.offset == this->offset); + } + + bool operator!=(const nd_range &rhs) const { + return !(*this == rhs); + } +}; + +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/platform.hpp b/sycl/include/CL/sycl/platform.hpp new file mode 100644 index 000000000000..d42ffbf5b882 --- /dev/null +++ b/sycl/include/CL/sycl/platform.hpp @@ -0,0 +1,80 @@ +//==---------------- platform.hpp - SYCL platform --------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +// 4.6.2 Platform class +#include +#include +namespace cl { +namespace sycl { + +// TODO: make code thread-safe + +// Forward declaration +class device_selector; +class device; + +class platform { +public: + platform(); + + explicit platform(cl_platform_id platform_id); + + explicit platform(const device_selector &); + + template + typename info::param_traits::return_type + get_info() const { + return impl->get_info(); + } + + platform(const platform &rhs) = default; + + platform(platform &&rhs) = default; + + platform &operator=(const platform &rhs) = default; + + platform &operator=(platform &&rhs) = default; + + bool operator==(const platform &rhs) const { return impl == rhs.impl; } + + bool operator!=(const platform &rhs) const { return !(*this == rhs); } + + cl_platform_id get() const { return impl->get(); } + + bool has_extension(const string_class &extension_name) const { + return impl->has_extension(extension_name); + } + + bool is_host() const { return impl->is_host(); } + + vector_class + get_devices(info::device_type dev_type = info::device_type::all) const; + + static vector_class get_platforms(); + +private: + std::shared_ptr impl; + template + friend decltype(T::impl) detail::getSyclObjImpl(const T &SyclObject); +}; // class platform +} // namespace sycl +} // namespace cl + +namespace std { +template <> struct hash { + size_t operator()(const cl::sycl::platform &p) const { + return hash>()( + cl::sycl::detail::getSyclObjImpl(p)); + } +}; +} // namespace std diff --git a/sycl/include/CL/sycl/pointers.hpp b/sycl/include/CL/sycl/pointers.hpp new file mode 100644 index 000000000000..f8077cadd6a2 --- /dev/null +++ b/sycl/include/CL/sycl/pointers.hpp @@ -0,0 +1,35 @@ +//==------------ pointers.hpp - SYCL pointers classes ----------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under 
the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once +#include + + +namespace cl { +namespace sycl { + +template class multi_ptr; +// Template specialization aliases for different pointer address spaces + +template +using global_ptr = multi_ptr; + +template +using local_ptr = multi_ptr; + +template +using constant_ptr = + multi_ptr; + +template +using private_ptr = + multi_ptr; + +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/program.hpp b/sycl/include/CL/sycl/program.hpp new file mode 100644 index 000000000000..5db53194f22e --- /dev/null +++ b/sycl/include/CL/sycl/program.hpp @@ -0,0 +1,145 @@ +//==--------------- program.hpp --- SYCL program ---------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include + +namespace cl { +namespace sycl { + +class context; +class device; +class kernel; + +class program { + template + friend decltype(T::impl) detail::getSyclObjImpl(const T &SyclObject); + template + friend T detail::createSyclObjFromImpl(decltype(T::impl) ImplObj); + +public: + program() = delete; + + explicit program(const context &context) + : impl(std::make_shared(context)) {} + + program(const context &context, vector_class deviceList) + : impl(std::make_shared(context, deviceList)) {} + + program(vector_class programList, string_class linkOptions = "") { + std::vector> impls; + for (auto &x : programList) { + impls.push_back(detail::getSyclObjImpl(x)); + } + impl = std::make_shared(impls, linkOptions); + } + + program(const context &context, cl_program clProgram) + : impl(std::make_shared(context, clProgram)) {} + + program(const program &rhs) = default; + + program(program &&rhs) = default; + + program &operator=(const program &rhs) = default; + + program &operator=(program &&rhs) = default; + + bool operator==(const program &rhs) const { return impl == rhs.impl; } + + bool operator!=(const program &rhs) const { return !operator==(rhs); } + + cl_program get() const { return impl->get(); } + + bool is_host() const { return impl->is_host(); } + + template + void compile_with_kernel_type(string_class compileOptions = "") { + impl->compile_with_kernel_type(compileOptions); + } + + void compile_with_source(string_class kernelSource, + string_class compileOptions = "") { + impl->compile_with_source(kernelSource, compileOptions); + } + + template + void build_with_kernel_type(string_class buildOptions = "") { + impl->build_with_kernel_type(buildOptions); + } + + void build_with_source(string_class kernelSource, + string_class buildOptions = "") { + impl->build_with_source(kernelSource, buildOptions); + } + + void 
link(string_class linkOptions = "") { impl->link(linkOptions); } + + template bool has_kernel() const { + return impl->has_kernel(); + } + + bool has_kernel(string_class kernelName) const { + return impl->has_kernel(kernelName); + } + + template kernel get_kernel() const { + return impl->get_kernel(impl); + } + + kernel get_kernel(string_class kernelName) const { + return impl->get_kernel(kernelName, impl); + } + + template + typename info::param_traits::return_type + get_info() const { + return impl->get_info(); + } + + vector_class> get_binaries() const { + return impl->get_binaries(); + } + + context get_context() const { return impl->get_context(); } + + vector_class get_devices() const { return impl->get_devices(); } + + string_class get_compile_options() const { + return impl->get_compile_options(); + } + + string_class get_link_options() const { return impl->get_link_options(); } + + string_class get_build_options() const { return impl->get_build_options(); } + + program_state get_state() const { return impl->get_state(); } + +private: + program(std::shared_ptr impl) : impl(impl) {} + + std::shared_ptr impl; +}; +} // namespace sycl +} // namespace cl + +namespace std { +template <> struct hash { + size_t operator()(const cl::sycl::program &prg) const { + return hash>()( + cl::sycl::detail::getSyclObjImpl(prg)); + } +}; +} // namespace std diff --git a/sycl/include/CL/sycl/property_list.hpp b/sycl/include/CL/sycl/property_list.hpp new file mode 100644 index 000000000000..706be7ea2a04 --- /dev/null +++ b/sycl/include/CL/sycl/property_list.hpp @@ -0,0 +1,249 @@ +//==--------- property_list.hpp --- SYCL property list ---------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +namespace cl { +namespace sycl { +// Forward declaration +class context; + +// HOW TO ADD NEW PROPERTY INSTRUCTION: +// 1. Add forward declaration of property class. +// 2. Add new record in PropKind enum. +// 3. Use RegisterProp macro passing new record from enum and new class. +// 4. Add implementation of the new property class using detail::Prop class with +// template parameter = new record in enum as a base class. + +namespace property { + +namespace image { +class use_host_ptr; +class use_mutex; +class context_bound; +} // namespace image + +namespace buffer { +class use_host_ptr; +class use_mutex; +class context_bound; +} // namespace buffer + +namespace queue { +class enable_profiling; +} // namespace queue + +namespace detail { + +// List of all properties' IDs. +enum PropKind { + // Buffer properties + BufferUseHostPtr = 0, + BufferContextBound, + BufferUseMutex, + + // Image properties + ImageUseHostPtr, + ImageContextBound, + ImageUseMutex, + + // Queue properties + QueueEnableProfiling, + + PropKindSize +}; + +// Base class for all properties. Needed to check that user passed only +// SYCL's properties to property_list c'tor. +class PropBase {}; + +// Second base class, needed for mapping PropKind to class and vice versa. +template class Prop; + +// This class is used in property_list to hold properties. +template class PropertyHolder { +public: + void setProp(const T &Rhs) { + new (m_Mem) T(Rhs); + m_Initialized = true; + } + + const T &getProp() const { + assert(true == m_Initialized && "Property was not set!"); + return *(T *)m_Mem; + } + bool isInitialized() const { return m_Initialized; } + +private: + // Memory that is used for property allocation + unsigned char m_Mem[sizeof(T)]; + // Indicate whether property initialized or not. 
+ bool m_Initialized = false; +}; + +// This macro adds specialization of class Prop which provides possibility to +// convert PropKind to class and vice versa. +#define RegisterProp(PropKindT, Type) \ + template <> class Prop : public PropBase { \ + public: \ + static constexpr PropKind getKind() { return PropKindT; } \ + using FinalType = Type; \ + } + +// Image +RegisterProp(PropKind::ImageUseHostPtr, image::use_host_ptr); +RegisterProp(PropKind::ImageUseMutex, image::use_mutex); +RegisterProp(PropKind::ImageContextBound, image::context_bound); + +// Buffer +RegisterProp(PropKind::BufferUseHostPtr, buffer::use_host_ptr); +RegisterProp(PropKind::BufferUseMutex, buffer::use_mutex); +RegisterProp(PropKind::BufferContextBound, buffer::context_bound); + +// Queue +RegisterProp(PropKind::QueueEnableProfiling, queue::enable_profiling); + +// Sentinel, needed for automatic build of tuple in property_list. +RegisterProp(PropKind::PropKindSize, PropBase); + +// Common class for use_mutex in buffer and image namespaces. +template class UseMutexBase : public Prop { +public: + UseMutexBase(mutex_class &MutexRef) : m_MutexClass(MutexRef) {} + mutex_class *get_mutex_ptr() const { return &m_MutexClass; } + +private: + mutex_class &m_MutexClass; +}; + +// Common class for context_bound in buffer and image namespaces. 
+template class ContextBoundBase : public Prop { +public: + ContextBoundBase(cl::sycl::context Context) : m_Context(Context) {} + context get_context() const { return m_Context; } + +private: + cl::sycl::context m_Context; +}; +} // namespace detail + +namespace image { + +class use_host_ptr : public detail::Prop {}; + +class use_mutex : public detail::UseMutexBase { +public: + use_mutex(mutex_class &MutexRef) : UseMutexBase(MutexRef) {} +}; + +class context_bound + : public detail::ContextBoundBase { +public: + context_bound(cl::sycl::context Context) : ContextBoundBase(Context) {} +}; + +} // namespace image + +namespace buffer { + +class use_host_ptr : public detail::Prop {}; + +class use_mutex + : public detail::UseMutexBase { +public: + use_mutex(mutex_class &MutexRef) : UseMutexBase(MutexRef) {} +}; + +class context_bound + : public detail::ContextBoundBase { +public: + context_bound(cl::sycl::context Context) : ContextBoundBase(Context) {} +}; +} // namespace buffer + +namespace queue { +class enable_profiling + : public detail::Prop {}; +} // namespace queue + +} // namespace property + +class property_list { + + // The structs validate that all objects passed are base of PropBase class. + template struct AllProperties : std::true_type {}; + template + struct AllProperties + : std::conditional::value, + AllProperties, std::false_type>::type {}; + + template + using PropertyHolder = cl::sycl::property::detail::PropertyHolder; + template + using Property = cl::sycl::property::detail::Prop; + + // The structs build tuple type that can hold all properties. 
+ template struct DefineTupleType { + using Type = std::tuple; + }; + + template + struct BuildTupleType + : public std::conditional< + (Counter < property::detail::PropKind::PropKindSize), + BuildTupleType< + Counter + 1, Head..., + PropertyHolder::FinalType>>, + DefineTupleType>::type {}; + +public: + // C'tor initialize m_PropList with properties passed by invoking ctorHelper + // recursively + template ::value>::type> + property_list(propertyTN... Props) { + ctorHelper(Props...); + } + + template propertyT get_property() const { + static_assert((int)(propertyT::getKind()) <= + property::detail::PropKind::PropKindSize, + "Invalid option passed."); + const auto &PropHolder = std::get<(int)(propertyT::getKind())>(m_PropsList); + if (PropHolder.isInitialized()) { + return PropHolder.getProp(); + } + throw invalid_object_error(); + } + + template bool has_property() const { + return std::get<(int)(propertyT::getKind())>(m_PropsList).isInitialized(); + } + +private: + void ctorHelper() {} + + template + void ctorHelper(PropT &Prop, propertyTN... props) { + std::get<(int)(PropT::getKind())>(m_PropsList).setProp(Prop); + ctorHelper(props...); + } + + // Tuple that able to hold all the properties. + BuildTupleType<0>::Type m_PropsList; +}; + +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/queue.hpp b/sycl/include/CL/sycl/queue.hpp new file mode 100644 index 000000000000..824230607cf3 --- /dev/null +++ b/sycl/include/CL/sycl/queue.hpp @@ -0,0 +1,122 @@ +//==-------------------- queue.hpp - SYCL queue ----------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace cl { +namespace sycl { + +// Forward declaration +class context; +class device; +class queue { +public: + explicit queue(const property_list &propList = {}) + : queue(default_selector(), async_handler{}, propList) {} + + queue(const async_handler &asyncHandler, const property_list &propList = {}) + : queue(default_selector(), asyncHandler, propList) {} + + queue(const device_selector &deviceSelector, + const property_list &propList = {}) + : queue(deviceSelector.select_device(), async_handler{}, propList) {} + + queue(const device_selector &deviceSelector, + const async_handler &asyncHandler, const property_list &propList = {}) + : queue(deviceSelector.select_device(), asyncHandler, propList) {} + + queue(const device &syclDevice, const property_list &propList = {}) + : queue(syclDevice, async_handler{}, propList) {} + + queue(const device &syclDevice, const async_handler &asyncHandler, + const property_list &propList = {}); + + queue(const context &syclContext, const device_selector &deviceSelector, + const property_list &propList = {}) + : queue(syclContext, deviceSelector, + detail::getSyclObjImpl(syclContext)->get_async_handler(), + propList) {} + + queue(const context &syclContext, const device_selector &deviceSelector, + const async_handler &asyncHandler, const property_list &propList = {}); + + queue(cl_command_queue clQueue, const context &syclContext, + const async_handler &asyncHandler = {}); + + queue(const queue &rhs) = default; + + queue(queue &&rhs) = default; + + queue &operator=(const queue &rhs) = default; + + queue &operator=(queue &&rhs) = default; + + bool operator==(const queue &rhs) const { return impl == rhs.impl; } + + bool operator!=(const queue &rhs) const { return !(*this == rhs); } + + cl_command_queue get() const { return impl->get(); } + + context 
get_context() const { return impl->get_context(); } + + device get_device() const { return impl->get_device(); } + + bool is_host() const { return impl->is_host(); } + + template + typename info::param_traits::return_type + get_info() const { + return impl->get_info(); + } + + template event submit(T cgf) { return impl->submit(cgf, impl); } + + template event submit(T cgf, queue &secondaryQueue) { + return impl->submit(cgf, impl, secondaryQueue.impl); + } + + void wait() { impl->wait(); } + + void wait_and_throw() { impl->wait_and_throw(); } + + void throw_asynchronous() { impl->throw_asynchronous(); } + + template bool has_property() const { + return impl->has_property(); + } + + template propertyT get_property() const { + return impl->get_property(); + } + +private: + std::shared_ptr impl; + template + friend decltype(T::impl) detail::getSyclObjImpl(const T &SyclObject); +}; + +} // namespace sycl +} // namespace cl + +namespace std { +template <> struct hash { + size_t operator()(const cl::sycl::queue &q) const { + return std::hash>()( + cl::sycl::detail::getSyclObjImpl(q)); + } +}; +} // namespace std diff --git a/sycl/include/CL/sycl/range.hpp b/sycl/include/CL/sycl/range.hpp new file mode 100644 index 000000000000..6d7f4919f66e --- /dev/null +++ b/sycl/include/CL/sycl/range.hpp @@ -0,0 +1,544 @@ +//==----------- range.hpp --- SYCL iteration range -------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +namespace cl { +namespace sycl { +template struct id; +template +class range : public detail::array { +public: + using base = detail::array; + /* The following constructor is only available in the range class + specialization where: dimensions==1 */ + template + range(typename std::enable_if<(N == 1), size_t>::type dim0) : base(dim0) {} + + /* The following constructor is only available in the range class + specialization where: dimensions==2 */ + template + range(typename std::enable_if<(N == 2), size_t>::type dim0, size_t dim1) + : base(dim0, dim1) {} + + /* The following constructor is only available in the range class + specialization where: dimensions==3 */ + template + range(typename std::enable_if<(N == 3), size_t>::type dim0, size_t dim1, + size_t dim2) : base(dim0, dim1, dim2) {} + + explicit operator id() const { + id result; + for (int i = 0; i < dimensions; ++i) { + result[i] = this->get(i); + } + return result; + } + + size_t size() const { + size_t size = 1; + for (int i = 0; i < dimensions; ++i) { + size *= this->get(i); + } + return size; + } + + range(const range &rhs) = default; + range(range &&rhs) = default; + range &operator=(const range &rhs) = default; + range &operator=(range &&rhs) = default; + range() = default; + + // OP is: +, -, *, /, %, <<, >>, &, |, ^, &&, ||, <, >, <=, >= + range operator+(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] + rhs.common_array[i]; + } + return result; + } + range operator-(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] - rhs.common_array[i]; + } + return result; + } + range operator*(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = 
this->common_array[i] * rhs.common_array[i]; + } + return result; + } + range operator/(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] / rhs.common_array[i]; + } + return result; + } + range operator%(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] % rhs.common_array[i]; + } + return result; + } + range operator<<(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] << rhs.common_array[i]; + } + return result; + } + range operator>>(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] >> rhs.common_array[i]; + } + return result; + } + range operator&(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] & rhs.common_array[i]; + } + return result; + } + range operator|(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] | rhs.common_array[i]; + } + return result; + } + range operator^(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] ^ rhs.common_array[i]; + } + return result; + } + range operator&&(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] && rhs.common_array[i]; + } + return result; + } + range operator||(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] || rhs.common_array[i]; + } + return result; + } + range operator<(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = 
this->common_array[i] < rhs.common_array[i]; + } + return result; + } + range operator>(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] > rhs.common_array[i]; + } + return result; + } + range operator<=(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] <= rhs.common_array[i]; + } + return result; + } + range operator>=(const range &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] >= rhs.common_array[i]; + } + return result; + } + + // OP is: +, -, *, /, %, <<, >>, &, |, ^, &&, ||, <, >, <=, >= + range operator+(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] + rhs; + } + return result; + } + range operator-(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] - rhs; + } + return result; + } + range operator*(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] * rhs; + } + return result; + } + range operator/(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] / rhs; + } + return result; + } + range operator%(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] % rhs; + } + return result; + } + range operator<<(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] << rhs; + } + return result; + } + range operator>>(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] >> rhs; + } 
+ return result; + } + range operator&(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] & rhs; + } + return result; + } + range operator|(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] | rhs; + } + return result; + } + range operator^(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] ^ rhs; + } + return result; + } + range operator&&(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] && rhs; + } + return result; + } + range operator||(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] || rhs; + } + return result; + } + range operator<(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] < rhs; + } + return result; + } + range operator>(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] > rhs; + } + return result; + } + range operator<=(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] <= rhs; + } + return result; + } + range operator>=(const size_t &rhs) const { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = this->common_array[i] >= rhs; + } + return result; + } + + // OP is: +=, -=, *=, /=, %=, <<=, >>=, &=, |=, ^= + range &operator+=(const range &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] += rhs[i]; + } + return *this; + } + range &operator-=(const range &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] -= rhs.common_array[i]; 
+ } + return *this; + } + range &operator*=(const range &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] *= rhs.common_array[i]; + } + return *this; + } + range &operator/=(const range &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] /= rhs.common_array[i]; + } + return *this; + } + range &operator%=(const range &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] %= rhs.common_array[i]; + } + return *this; + } + range &operator<<=(const range &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] <<= rhs.common_array[i]; + } + return *this; + } + range &operator>>=(const range &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] >>= rhs.common_array[i]; + } + return *this; + } + range &operator&=(const range &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] &= rhs.common_array[i]; + } + return *this; + } + range &operator|=(const range &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] |= rhs.common_array[i]; + } + return *this; + } + range &operator^=(const range &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] ^= rhs.common_array[i]; + } + return *this; + } + + // OP is: +=, -=, *=, /=, %=, <<=, >>=, &=, |=, ^= + range &operator+=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] += rhs; + } + return *this; + } + range &operator-=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] -= rhs; + } + return *this; + } + range &operator*=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] *= rhs; + } + return *this; + } + range &operator/=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] /= rhs; + } + return *this; + } + range &operator%=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] %= rhs; + } + return *this; + } + 
range &operator<<=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] <<= rhs; + } + return *this; + } + range &operator>>=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] >>= rhs; + } + return *this; + } + range &operator&=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] &= rhs; + } + return *this; + } + range &operator|=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] |= rhs; + } + return *this; + } + range &operator^=(const size_t &rhs) { + for (int i = 0; i < dimensions; ++i) { + this->common_array[i] ^= rhs; + } + return *this; + } + + // OP is: +, -, *, /, %, <<, >>, &, |, ^, <, >, <=, >=, &&, || + friend range operator+(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs + rhs.common_array[i]; + } + return result; + } + friend range operator-(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs - rhs.common_array[i]; + } + return result; + } + friend range operator*(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs * rhs.common_array[i]; + } + return result; + } + friend range operator/(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs / rhs.common_array[i]; + } + return result; + } + friend range operator%(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs % rhs.common_array[i]; + } + return result; + } + friend range operator<<(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs << rhs.common_array[i]; + } + return result; + } + friend range 
operator>>(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs >> rhs.common_array[i]; + } + return result; + } + friend range operator&(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs & rhs.common_array[i]; + } + return result; + } + friend range operator|(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs | rhs.common_array[i]; + } + return result; + } + friend range operator^(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs ^ rhs.common_array[i]; + } + return result; + } + friend range operator<(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs < rhs.common_array[i]; + } + return result; + } + friend range operator>(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs > rhs.common_array[i]; + } + return result; + } + friend range operator<=(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs <= rhs.common_array[i]; + } + return result; + } + friend range operator>=(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs >= rhs.common_array[i]; + } + return result; + } + friend range operator&&(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs && rhs.common_array[i]; + } + return result; + } + friend range operator||(const size_t &lhs, + const range &rhs) { + range result; + for (int i = 0; i < dimensions; ++i) { + result.common_array[i] = lhs || rhs.common_array[i]; + } + 
return result; + } +}; +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/stl.hpp b/sycl/include/CL/sycl/stl.hpp new file mode 100644 index 000000000000..5322fee53550 --- /dev/null +++ b/sycl/include/CL/sycl/stl.hpp @@ -0,0 +1,50 @@ +//==----------- stl.hpp - basic STL implementation -------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +// 4.5 C++ Standard library classes required for the interface + +#include +#include +#include +#include +#include +#include + +namespace cl { +namespace sycl { + + template < class T, class Alloc = std::allocator > + using vector_class = std::vector; + + using string_class = std::string; + + template + using function_class = std::function; + + using mutex_class = std::mutex; + + template > + using unique_ptr_class = std::unique_ptr; + + template + using shared_ptr_class = std::shared_ptr; + + template + using weak_ptr_class = std::weak_ptr; + + template + using hash_class = std::hash; + + using exception_ptr_class = std::exception_ptr; + +} // sycl +} // cl + diff --git a/sycl/include/CL/sycl/swizzles.def b/sycl/include/CL/sycl/swizzles.def new file mode 100644 index 000000000000..0c25d4d394b5 --- /dev/null +++ b/sycl/include/CL/sycl/swizzles.def @@ -0,0 +1,842 @@ +//==---------------- swizzles.def --- SYCL types ---------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// included to types.hppp twice, once for vec<> and once for SwizzleOp<>. + +// TODO: exclude L-Value swizzle like vec.xxxx() +#ifdef __SYCL_ACCESS +#error Undefine __SYCL_ACCESS macro. 
+#endif +#define __SYCL_ACCESS(_COND, _NAME, ...) \ + template \ + typename std::enable_if<(_COND), Swizzle<__VA_ARGS__>>::type _NAME() { \ + return __SYCL_ACCESS_RETURN; \ + } \ + template \ + typename std::enable_if<(_COND), ConstSwizzle<__VA_ARGS__>>::type _NAME() \ + const { \ + return __SYCL_ACCESS_RETURN; \ + } + +//__swizzled_vec__ XYZW_ACCESS() const; +__SYCL_ACCESS(N <= 4, x, Indexer(0)) +__SYCL_ACCESS(N == 2 || N == 3 || N == 4, y, Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, z, Indexer(2)) +__SYCL_ACCESS(N == 4, w, Indexer(3)) + +//__swizzled_vec__ RGBA_ACCESS() const; +__SYCL_ACCESS(N == 4, r, Indexer(0)) +__SYCL_ACCESS(N == 4, g, Indexer(1)) +__SYCL_ACCESS(N == 4, b, Indexer(2)) +__SYCL_ACCESS(N == 4, a, Indexer(3)) + +//__swizzled_vec__ INDEX_ACCESS() const; +__SYCL_ACCESS(N > 0, s0, Indexer(0)) +__SYCL_ACCESS(N > 1, s1, Indexer(1)) +__SYCL_ACCESS(N > 2, s2, Indexer(2)) +__SYCL_ACCESS(N > 2, s3, Indexer(3)) +__SYCL_ACCESS(N > 4, s4, Indexer(4)) +__SYCL_ACCESS(N > 4, s5, Indexer(5)) +__SYCL_ACCESS(N > 4, s6, Indexer(6)) +__SYCL_ACCESS(N > 4, s7, Indexer(7)) +__SYCL_ACCESS(N == 16, s8, Indexer(8)) +__SYCL_ACCESS(N == 16, s9, Indexer(9)) +__SYCL_ACCESS(N == 16, sA, Indexer(10)) +__SYCL_ACCESS(N == 16, sB, Indexer(11)) +__SYCL_ACCESS(N == 16, sC, Indexer(12)) +__SYCL_ACCESS(N == 16, sD, Indexer(13)) +__SYCL_ACCESS(N == 16, sE, Indexer(14)) +__SYCL_ACCESS(N == 16, sF, Indexer(15)) + +#ifdef SYCL_SIMPLE_SWIZZLES +//__swizzled_vec__ XYZW_SWIZZLE() const; +__SYCL_ACCESS(N <= 4, xx, Indexer(0), Indexer(0)) +__SYCL_ACCESS(2 <= N && N <= 4, xy, Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, xz, Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, xw, Indexer(0), Indexer(3)) +__SYCL_ACCESS(2 <= N && N <= 4, yx, Indexer(1), Indexer(0)) +__SYCL_ACCESS(2 <= N && N <= 4, yy, Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, yz, Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, yw, Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, zx, Indexer(2), 
Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, zy, Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, zz, Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, zw, Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, wx, Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, wy, Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, wz, Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, ww, Indexer(3), Indexer(3)) +__SYCL_ACCESS(N <= 4, xxx, Indexer(0), Indexer(0), Indexer(0)) +__SYCL_ACCESS(2 <= N && N <= 4, xxy, Indexer(0), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, xxz, Indexer(0), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, xxw, Indexer(0), Indexer(0), Indexer(3)) +__SYCL_ACCESS(2 <= N && N <= 4, xyx, Indexer(0), Indexer(1), Indexer(0)) +__SYCL_ACCESS(2 <= N && N <= 4, xyy, Indexer(0), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, xyz, Indexer(0), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, xyw, Indexer(0), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, xzx, Indexer(0), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, xzy, Indexer(0), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, xzz, Indexer(0), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, xzw, Indexer(0), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, xwx, Indexer(0), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, xwy, Indexer(0), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, xwz, Indexer(0), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, xww, Indexer(0), Indexer(3), Indexer(3)) +__SYCL_ACCESS(2 <= N && N <= 4, yxx, Indexer(1), Indexer(0), Indexer(0)) +__SYCL_ACCESS(2 <= N && N <= 4, yxy, Indexer(1), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, yxz, Indexer(1), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, yxw, Indexer(1), Indexer(0), Indexer(3)) +__SYCL_ACCESS(2 <= N && N <= 4, yyx, Indexer(1), Indexer(1), Indexer(0)) +__SYCL_ACCESS(2 <= N && N <= 4, yyy, Indexer(1), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, yyz, Indexer(1), Indexer(1), 
Indexer(2)) +__SYCL_ACCESS(N == 4, yyw, Indexer(1), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, yzx, Indexer(1), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, yzy, Indexer(1), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, yzz, Indexer(1), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, yzw, Indexer(1), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, ywx, Indexer(1), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, ywy, Indexer(1), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, ywz, Indexer(1), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, yww, Indexer(1), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, zxx, Indexer(2), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, zxy, Indexer(2), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, zxz, Indexer(2), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, zxw, Indexer(2), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, zyx, Indexer(2), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, zyy, Indexer(2), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, zyz, Indexer(2), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, zyw, Indexer(2), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, zzx, Indexer(2), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, zzy, Indexer(2), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, zzz, Indexer(2), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, zzw, Indexer(2), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, zwx, Indexer(2), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, zwy, Indexer(2), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, zwz, Indexer(2), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, zww, Indexer(2), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, wxx, Indexer(3), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, wxy, Indexer(3), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, wxz, Indexer(3), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, wxw, Indexer(3), Indexer(0), Indexer(3)) 
+__SYCL_ACCESS(N == 4, wyx, Indexer(3), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, wyy, Indexer(3), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, wyz, Indexer(3), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, wyw, Indexer(3), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, wzx, Indexer(3), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, wzy, Indexer(3), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, wzz, Indexer(3), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, wzw, Indexer(3), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, wwx, Indexer(3), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, wwy, Indexer(3), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, wwz, Indexer(3), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, www, Indexer(3), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N <= 4, xxxx, Indexer(0), Indexer(0), Indexer(0), Indexer(0)) +__SYCL_ACCESS(2 <= N && N <= 4, xxxy, Indexer(0), Indexer(0), Indexer(0), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, xxxz, Indexer(0), Indexer(0), Indexer(0), + Indexer(2)) +__SYCL_ACCESS(N == 4, xxxw, Indexer(0), Indexer(0), Indexer(0), Indexer(3)) +__SYCL_ACCESS(2 <= N && N <= 4, xxyx, Indexer(0), Indexer(0), Indexer(1), + Indexer(0)) +__SYCL_ACCESS(2 <= N && N <= 4, xxyy, Indexer(0), Indexer(0), Indexer(1), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, xxyz, Indexer(0), Indexer(0), Indexer(1), + Indexer(2)) +__SYCL_ACCESS(N == 4, xxyw, Indexer(0), Indexer(0), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, xxzx, Indexer(0), Indexer(0), Indexer(2), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, xxzy, Indexer(0), Indexer(0), Indexer(2), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, xxzz, Indexer(0), Indexer(0), Indexer(2), + Indexer(2)) +__SYCL_ACCESS(N == 4, xxzw, Indexer(0), Indexer(0), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, xxwx, Indexer(0), Indexer(0), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, xxwy, Indexer(0), Indexer(0), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, xxwz, Indexer(0), Indexer(0), 
Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, xxww, Indexer(0), Indexer(0), Indexer(3), Indexer(3)) +__SYCL_ACCESS(2 <= N && N <= 4, xyxx, Indexer(0), Indexer(1), Indexer(0), + Indexer(0)) +__SYCL_ACCESS(2 <= N && N <= 4, xyxy, Indexer(0), Indexer(1), Indexer(0), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, xyxz, Indexer(0), Indexer(1), Indexer(0), + Indexer(2)) +__SYCL_ACCESS(N == 4, xyxw, Indexer(0), Indexer(1), Indexer(0), Indexer(3)) +__SYCL_ACCESS(2 <= N && N <= 4, xyyx, Indexer(0), Indexer(1), Indexer(1), + Indexer(0)) +__SYCL_ACCESS(2 <= N && N <= 4, xyyy, Indexer(0), Indexer(1), Indexer(1), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, xyyz, Indexer(0), Indexer(1), Indexer(1), + Indexer(2)) +__SYCL_ACCESS(N == 4, xyyw, Indexer(0), Indexer(1), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, xyzx, Indexer(0), Indexer(1), Indexer(2), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, xyzy, Indexer(0), Indexer(1), Indexer(2), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, xyzz, Indexer(0), Indexer(1), Indexer(2), + Indexer(2)) +__SYCL_ACCESS(N == 4, xyzw, Indexer(0), Indexer(1), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, xywx, Indexer(0), Indexer(1), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, xywy, Indexer(0), Indexer(1), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, xywz, Indexer(0), Indexer(1), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, xyww, Indexer(0), Indexer(1), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, xzxx, Indexer(0), Indexer(2), Indexer(0), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, xzxy, Indexer(0), Indexer(2), Indexer(0), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, xzxz, Indexer(0), Indexer(2), Indexer(0), + Indexer(2)) +__SYCL_ACCESS(N == 4, xzxw, Indexer(0), Indexer(2), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, xzyx, Indexer(0), Indexer(2), Indexer(1), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, xzyy, Indexer(0), Indexer(2), Indexer(1), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, 
xzyz, Indexer(0), Indexer(2), Indexer(1), + Indexer(2)) +__SYCL_ACCESS(N == 4, xzyw, Indexer(0), Indexer(2), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, xzzx, Indexer(0), Indexer(2), Indexer(2), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, xzzy, Indexer(0), Indexer(2), Indexer(2), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, xzzz, Indexer(0), Indexer(2), Indexer(2), + Indexer(2)) +__SYCL_ACCESS(N == 4, xzzw, Indexer(0), Indexer(2), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, xzwx, Indexer(0), Indexer(2), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, xzwy, Indexer(0), Indexer(2), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, xzwz, Indexer(0), Indexer(2), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, xzww, Indexer(0), Indexer(2), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, xwxx, Indexer(0), Indexer(3), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, xwxy, Indexer(0), Indexer(3), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, xwxz, Indexer(0), Indexer(3), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, xwxw, Indexer(0), Indexer(3), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, xwyx, Indexer(0), Indexer(3), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, xwyy, Indexer(0), Indexer(3), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, xwyz, Indexer(0), Indexer(3), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, xwyw, Indexer(0), Indexer(3), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, xwzx, Indexer(0), Indexer(3), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, xwzy, Indexer(0), Indexer(3), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, xwzz, Indexer(0), Indexer(3), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, xwzw, Indexer(0), Indexer(3), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, xwwx, Indexer(0), Indexer(3), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, xwwy, Indexer(0), Indexer(3), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, xwwz, Indexer(0), Indexer(3), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, xwww, Indexer(0), Indexer(3), 
Indexer(3), Indexer(3)) +__SYCL_ACCESS(2 <= N && N <= 4, yxxx, Indexer(1), Indexer(0), Indexer(0), + Indexer(0)) +__SYCL_ACCESS(2 <= N && N <= 4, yxxy, Indexer(1), Indexer(0), Indexer(0), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, yxxz, Indexer(1), Indexer(0), Indexer(0), + Indexer(2)) +__SYCL_ACCESS(N == 4, yxxw, Indexer(1), Indexer(0), Indexer(0), Indexer(3)) +__SYCL_ACCESS(2 <= N && N <= 4, yxyx, Indexer(1), Indexer(0), Indexer(1), + Indexer(0)) +__SYCL_ACCESS(2 <= N && N <= 4, yxyy, Indexer(1), Indexer(0), Indexer(1), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, yxyz, Indexer(1), Indexer(0), Indexer(1), + Indexer(2)) +__SYCL_ACCESS(N == 4, yxyw, Indexer(1), Indexer(0), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, yxzx, Indexer(1), Indexer(0), Indexer(2), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, yxzy, Indexer(1), Indexer(0), Indexer(2), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, yxzz, Indexer(1), Indexer(0), Indexer(2), + Indexer(2)) +__SYCL_ACCESS(N == 4, yxzw, Indexer(1), Indexer(0), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, yxwx, Indexer(1), Indexer(0), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, yxwy, Indexer(1), Indexer(0), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, yxwz, Indexer(1), Indexer(0), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, yxww, Indexer(1), Indexer(0), Indexer(3), Indexer(3)) +__SYCL_ACCESS(2 <= N && N <= 4, yyxx, Indexer(1), Indexer(1), Indexer(0), + Indexer(0)) +__SYCL_ACCESS(2 <= N && N <= 4, yyxy, Indexer(1), Indexer(1), Indexer(0), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, yyxz, Indexer(1), Indexer(1), Indexer(0), + Indexer(2)) +__SYCL_ACCESS(N == 4, yyxw, Indexer(1), Indexer(1), Indexer(0), Indexer(3)) +__SYCL_ACCESS(2 <= N && N <= 4, yyyx, Indexer(1), Indexer(1), Indexer(1), + Indexer(0)) +__SYCL_ACCESS(2 <= N && N <= 4, yyyy, Indexer(1), Indexer(1), Indexer(1), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, yyyz, Indexer(1), Indexer(1), Indexer(1), + Indexer(2)) +__SYCL_ACCESS(N == 4, 
yyyw, Indexer(1), Indexer(1), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, yyzx, Indexer(1), Indexer(1), Indexer(2), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, yyzy, Indexer(1), Indexer(1), Indexer(2), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, yyzz, Indexer(1), Indexer(1), Indexer(2), + Indexer(2)) +__SYCL_ACCESS(N == 4, yyzw, Indexer(1), Indexer(1), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, yywx, Indexer(1), Indexer(1), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, yywy, Indexer(1), Indexer(1), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, yywz, Indexer(1), Indexer(1), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, yyww, Indexer(1), Indexer(1), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, yzxx, Indexer(1), Indexer(2), Indexer(0), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, yzxy, Indexer(1), Indexer(2), Indexer(0), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, yzxz, Indexer(1), Indexer(2), Indexer(0), + Indexer(2)) +__SYCL_ACCESS(N == 4, yzxw, Indexer(1), Indexer(2), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, yzyx, Indexer(1), Indexer(2), Indexer(1), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, yzyy, Indexer(1), Indexer(2), Indexer(1), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, yzyz, Indexer(1), Indexer(2), Indexer(1), + Indexer(2)) +__SYCL_ACCESS(N == 4, yzyw, Indexer(1), Indexer(2), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, yzzx, Indexer(1), Indexer(2), Indexer(2), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, yzzy, Indexer(1), Indexer(2), Indexer(2), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, yzzz, Indexer(1), Indexer(2), Indexer(2), + Indexer(2)) +__SYCL_ACCESS(N == 4, yzzw, Indexer(1), Indexer(2), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, yzwx, Indexer(1), Indexer(2), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, yzwy, Indexer(1), Indexer(2), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, yzwz, Indexer(1), Indexer(2), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, yzww, 
Indexer(1), Indexer(2), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, ywxx, Indexer(1), Indexer(3), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, ywxy, Indexer(1), Indexer(3), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, ywxz, Indexer(1), Indexer(3), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, ywxw, Indexer(1), Indexer(3), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, ywyx, Indexer(1), Indexer(3), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, ywyy, Indexer(1), Indexer(3), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, ywyz, Indexer(1), Indexer(3), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, ywyw, Indexer(1), Indexer(3), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, ywzx, Indexer(1), Indexer(3), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, ywzy, Indexer(1), Indexer(3), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, ywzz, Indexer(1), Indexer(3), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, ywzw, Indexer(1), Indexer(3), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, ywwx, Indexer(1), Indexer(3), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, ywwy, Indexer(1), Indexer(3), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, ywwz, Indexer(1), Indexer(3), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, ywww, Indexer(1), Indexer(3), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, zxxx, Indexer(2), Indexer(0), Indexer(0), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, zxxy, Indexer(2), Indexer(0), Indexer(0), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, zxxz, Indexer(2), Indexer(0), Indexer(0), + Indexer(2)) +__SYCL_ACCESS(N == 4, zxxw, Indexer(2), Indexer(0), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, zxyx, Indexer(2), Indexer(0), Indexer(1), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, zxyy, Indexer(2), Indexer(0), Indexer(1), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, zxyz, Indexer(2), Indexer(0), Indexer(1), + Indexer(2)) +__SYCL_ACCESS(N == 4, zxyw, Indexer(2), Indexer(0), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 
4, zxzx, Indexer(2), Indexer(0), Indexer(2), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, zxzy, Indexer(2), Indexer(0), Indexer(2), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, zxzz, Indexer(2), Indexer(0), Indexer(2), + Indexer(2)) +__SYCL_ACCESS(N == 4, zxzw, Indexer(2), Indexer(0), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, zxwx, Indexer(2), Indexer(0), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, zxwy, Indexer(2), Indexer(0), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, zxwz, Indexer(2), Indexer(0), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, zxww, Indexer(2), Indexer(0), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, zyxx, Indexer(2), Indexer(1), Indexer(0), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, zyxy, Indexer(2), Indexer(1), Indexer(0), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, zyxz, Indexer(2), Indexer(1), Indexer(0), + Indexer(2)) +__SYCL_ACCESS(N == 4, zyxw, Indexer(2), Indexer(1), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, zyyx, Indexer(2), Indexer(1), Indexer(1), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, zyyy, Indexer(2), Indexer(1), Indexer(1), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, zyyz, Indexer(2), Indexer(1), Indexer(1), + Indexer(2)) +__SYCL_ACCESS(N == 4, zyyw, Indexer(2), Indexer(1), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, zyzx, Indexer(2), Indexer(1), Indexer(2), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, zyzy, Indexer(2), Indexer(1), Indexer(2), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, zyzz, Indexer(2), Indexer(1), Indexer(2), + Indexer(2)) +__SYCL_ACCESS(N == 4, zyzw, Indexer(2), Indexer(1), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, zywx, Indexer(2), Indexer(1), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, zywy, Indexer(2), Indexer(1), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, zywz, Indexer(2), Indexer(1), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, zyww, Indexer(2), Indexer(1), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, 
zzxx, Indexer(2), Indexer(2), Indexer(0), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, zzxy, Indexer(2), Indexer(2), Indexer(0), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, zzxz, Indexer(2), Indexer(2), Indexer(0), + Indexer(2)) +__SYCL_ACCESS(N == 4, zzxw, Indexer(2), Indexer(2), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, zzyx, Indexer(2), Indexer(2), Indexer(1), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, zzyy, Indexer(2), Indexer(2), Indexer(1), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, zzyz, Indexer(2), Indexer(2), Indexer(1), + Indexer(2)) +__SYCL_ACCESS(N == 4, zzyw, Indexer(2), Indexer(2), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 3 || N == 4, zzzx, Indexer(2), Indexer(2), Indexer(2), + Indexer(0)) +__SYCL_ACCESS(N == 3 || N == 4, zzzy, Indexer(2), Indexer(2), Indexer(2), + Indexer(1)) +__SYCL_ACCESS(N == 3 || N == 4, zzzz, Indexer(2), Indexer(2), Indexer(2), + Indexer(2)) +__SYCL_ACCESS(N == 4, zzzw, Indexer(2), Indexer(2), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, zzwx, Indexer(2), Indexer(2), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, zzwy, Indexer(2), Indexer(2), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, zzwz, Indexer(2), Indexer(2), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, zzww, Indexer(2), Indexer(2), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, zwxx, Indexer(2), Indexer(3), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, zwxy, Indexer(2), Indexer(3), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, zwxz, Indexer(2), Indexer(3), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, zwxw, Indexer(2), Indexer(3), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, zwyx, Indexer(2), Indexer(3), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, zwyy, Indexer(2), Indexer(3), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, zwyz, Indexer(2), Indexer(3), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, zwyw, Indexer(2), Indexer(3), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, zwzx, Indexer(2), Indexer(3), Indexer(2), Indexer(0)) 
+__SYCL_ACCESS(N == 4, zwzy, Indexer(2), Indexer(3), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, zwzz, Indexer(2), Indexer(3), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, zwzw, Indexer(2), Indexer(3), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, zwwx, Indexer(2), Indexer(3), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, zwwy, Indexer(2), Indexer(3), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, zwwz, Indexer(2), Indexer(3), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, zwww, Indexer(2), Indexer(3), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, wxxx, Indexer(3), Indexer(0), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, wxxy, Indexer(3), Indexer(0), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, wxxz, Indexer(3), Indexer(0), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, wxxw, Indexer(3), Indexer(0), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, wxyx, Indexer(3), Indexer(0), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, wxyy, Indexer(3), Indexer(0), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, wxyz, Indexer(3), Indexer(0), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, wxyw, Indexer(3), Indexer(0), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, wxzx, Indexer(3), Indexer(0), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, wxzy, Indexer(3), Indexer(0), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, wxzz, Indexer(3), Indexer(0), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, wxzw, Indexer(3), Indexer(0), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, wxwx, Indexer(3), Indexer(0), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, wxwy, Indexer(3), Indexer(0), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, wxwz, Indexer(3), Indexer(0), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, wxww, Indexer(3), Indexer(0), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, wyxx, Indexer(3), Indexer(1), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, wyxy, Indexer(3), Indexer(1), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, wyxz, Indexer(3), Indexer(1), Indexer(0), 
Indexer(2)) +__SYCL_ACCESS(N == 4, wyxw, Indexer(3), Indexer(1), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, wyyx, Indexer(3), Indexer(1), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, wyyy, Indexer(3), Indexer(1), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, wyyz, Indexer(3), Indexer(1), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, wyyw, Indexer(3), Indexer(1), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, wyzx, Indexer(3), Indexer(1), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, wyzy, Indexer(3), Indexer(1), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, wyzz, Indexer(3), Indexer(1), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, wyzw, Indexer(3), Indexer(1), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, wywx, Indexer(3), Indexer(1), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, wywy, Indexer(3), Indexer(1), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, wywz, Indexer(3), Indexer(1), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, wyww, Indexer(3), Indexer(1), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, wzxx, Indexer(3), Indexer(2), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, wzxy, Indexer(3), Indexer(2), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, wzxz, Indexer(3), Indexer(2), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, wzxw, Indexer(3), Indexer(2), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, wzyx, Indexer(3), Indexer(2), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, wzyy, Indexer(3), Indexer(2), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, wzyz, Indexer(3), Indexer(2), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, wzyw, Indexer(3), Indexer(2), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, wzzx, Indexer(3), Indexer(2), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, wzzy, Indexer(3), Indexer(2), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, wzzz, Indexer(3), Indexer(2), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, wzzw, Indexer(3), Indexer(2), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, wzwx, Indexer(3), Indexer(2), 
Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, wzwy, Indexer(3), Indexer(2), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, wzwz, Indexer(3), Indexer(2), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, wzww, Indexer(3), Indexer(2), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, wwxx, Indexer(3), Indexer(3), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, wwxy, Indexer(3), Indexer(3), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, wwxz, Indexer(3), Indexer(3), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, wwxw, Indexer(3), Indexer(3), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, wwyx, Indexer(3), Indexer(3), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, wwyy, Indexer(3), Indexer(3), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, wwyz, Indexer(3), Indexer(3), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, wwyw, Indexer(3), Indexer(3), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, wwzx, Indexer(3), Indexer(3), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, wwzy, Indexer(3), Indexer(3), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, wwzz, Indexer(3), Indexer(3), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, wwzw, Indexer(3), Indexer(3), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, wwwx, Indexer(3), Indexer(3), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, wwwy, Indexer(3), Indexer(3), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, wwwz, Indexer(3), Indexer(3), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, wwww, Indexer(3), Indexer(3), Indexer(3), Indexer(3)) + +//__swizzled_vec__ RGBA_SWIZZLE() const; +__SYCL_ACCESS(N == 4, rr, Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, rg, Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, rb, Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, ra, Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, gr, Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, gg, Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, gb, Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, ga, Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, br, Indexer(2), Indexer(0)) 
+__SYCL_ACCESS(N == 4, bg, Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, bb, Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, ba, Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, ar, Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, ag, Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, ab, Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, aa, Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, rrr, Indexer(0), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, rrg, Indexer(0), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, rrb, Indexer(0), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, rra, Indexer(0), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, rgr, Indexer(0), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, rgg, Indexer(0), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, rgb, Indexer(0), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, rga, Indexer(0), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, rbr, Indexer(0), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, rbg, Indexer(0), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, rbb, Indexer(0), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, rba, Indexer(0), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, rar, Indexer(0), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, rag, Indexer(0), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, rab, Indexer(0), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, raa, Indexer(0), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, grr, Indexer(1), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, grg, Indexer(1), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, grb, Indexer(1), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, gra, Indexer(1), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, ggr, Indexer(1), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, ggg, Indexer(1), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, ggb, Indexer(1), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, gga, Indexer(1), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, gbr, Indexer(1), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, gbg, Indexer(1), 
Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, gbb, Indexer(1), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, gba, Indexer(1), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, gar, Indexer(1), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, gag, Indexer(1), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, gab, Indexer(1), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, gaa, Indexer(1), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, brr, Indexer(2), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, brg, Indexer(2), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, brb, Indexer(2), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, bra, Indexer(2), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, bgr, Indexer(2), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, bgg, Indexer(2), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, bgb, Indexer(2), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, bga, Indexer(2), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, bbr, Indexer(2), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, bbg, Indexer(2), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, bbb, Indexer(2), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, bba, Indexer(2), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, bar, Indexer(2), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, bag, Indexer(2), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, bab, Indexer(2), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, baa, Indexer(2), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, arr, Indexer(3), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, arg, Indexer(3), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, arb, Indexer(3), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, ara, Indexer(3), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, agr, Indexer(3), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, agg, Indexer(3), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, agb, Indexer(3), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, aga, Indexer(3), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, abr, Indexer(3), Indexer(2), 
Indexer(0)) +__SYCL_ACCESS(N == 4, abg, Indexer(3), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, abb, Indexer(3), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, aba, Indexer(3), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, aar, Indexer(3), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, aag, Indexer(3), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, aab, Indexer(3), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, aaa, Indexer(3), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, rrrr, Indexer(0), Indexer(0), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, rrrg, Indexer(0), Indexer(0), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, rrrb, Indexer(0), Indexer(0), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, rrra, Indexer(0), Indexer(0), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, rrgr, Indexer(0), Indexer(0), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, rrgg, Indexer(0), Indexer(0), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, rrgb, Indexer(0), Indexer(0), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, rrga, Indexer(0), Indexer(0), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, rrbr, Indexer(0), Indexer(0), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, rrbg, Indexer(0), Indexer(0), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, rrbb, Indexer(0), Indexer(0), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, rrba, Indexer(0), Indexer(0), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, rrar, Indexer(0), Indexer(0), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, rrag, Indexer(0), Indexer(0), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, rrab, Indexer(0), Indexer(0), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, rraa, Indexer(0), Indexer(0), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, rgrr, Indexer(0), Indexer(1), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, rgrg, Indexer(0), Indexer(1), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, rgrb, Indexer(0), Indexer(1), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, rgra, Indexer(0), Indexer(1), Indexer(0), Indexer(3)) 
+__SYCL_ACCESS(N == 4, rggr, Indexer(0), Indexer(1), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, rggg, Indexer(0), Indexer(1), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, rggb, Indexer(0), Indexer(1), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, rgga, Indexer(0), Indexer(1), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, rgbr, Indexer(0), Indexer(1), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, rgbg, Indexer(0), Indexer(1), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, rgbb, Indexer(0), Indexer(1), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, rgba, Indexer(0), Indexer(1), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, rgar, Indexer(0), Indexer(1), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, rgag, Indexer(0), Indexer(1), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, rgab, Indexer(0), Indexer(1), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, rgaa, Indexer(0), Indexer(1), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, rbrr, Indexer(0), Indexer(2), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, rbrg, Indexer(0), Indexer(2), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, rbrb, Indexer(0), Indexer(2), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, rbra, Indexer(0), Indexer(2), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, rbgr, Indexer(0), Indexer(2), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, rbgg, Indexer(0), Indexer(2), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, rbgb, Indexer(0), Indexer(2), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, rbga, Indexer(0), Indexer(2), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, rbbr, Indexer(0), Indexer(2), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, rbbg, Indexer(0), Indexer(2), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, rbbb, Indexer(0), Indexer(2), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, rbba, Indexer(0), Indexer(2), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, rbar, Indexer(0), Indexer(2), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, rbag, Indexer(0), Indexer(2), Indexer(3), 
Indexer(1)) +__SYCL_ACCESS(N == 4, rbab, Indexer(0), Indexer(2), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, rbaa, Indexer(0), Indexer(2), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, rarr, Indexer(0), Indexer(3), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, rarg, Indexer(0), Indexer(3), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, rarb, Indexer(0), Indexer(3), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, rara, Indexer(0), Indexer(3), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, ragr, Indexer(0), Indexer(3), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, ragg, Indexer(0), Indexer(3), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, ragb, Indexer(0), Indexer(3), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, raga, Indexer(0), Indexer(3), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, rabr, Indexer(0), Indexer(3), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, rabg, Indexer(0), Indexer(3), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, rabb, Indexer(0), Indexer(3), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, raba, Indexer(0), Indexer(3), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, raar, Indexer(0), Indexer(3), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, raag, Indexer(0), Indexer(3), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, raab, Indexer(0), Indexer(3), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, raaa, Indexer(0), Indexer(3), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, grrr, Indexer(1), Indexer(0), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, grrg, Indexer(1), Indexer(0), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, grrb, Indexer(1), Indexer(0), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, grra, Indexer(1), Indexer(0), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, grgr, Indexer(1), Indexer(0), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, grgg, Indexer(1), Indexer(0), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, grgb, Indexer(1), Indexer(0), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, grga, Indexer(1), Indexer(0), 
Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, grbr, Indexer(1), Indexer(0), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, grbg, Indexer(1), Indexer(0), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, grbb, Indexer(1), Indexer(0), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, grba, Indexer(1), Indexer(0), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, grar, Indexer(1), Indexer(0), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, grag, Indexer(1), Indexer(0), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, grab, Indexer(1), Indexer(0), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, graa, Indexer(1), Indexer(0), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, ggrr, Indexer(1), Indexer(1), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, ggrg, Indexer(1), Indexer(1), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, ggrb, Indexer(1), Indexer(1), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, ggra, Indexer(1), Indexer(1), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, gggr, Indexer(1), Indexer(1), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, gggg, Indexer(1), Indexer(1), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, gggb, Indexer(1), Indexer(1), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, ggga, Indexer(1), Indexer(1), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, ggbr, Indexer(1), Indexer(1), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, ggbg, Indexer(1), Indexer(1), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, ggbb, Indexer(1), Indexer(1), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, ggba, Indexer(1), Indexer(1), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, ggar, Indexer(1), Indexer(1), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, ggag, Indexer(1), Indexer(1), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, ggab, Indexer(1), Indexer(1), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, ggaa, Indexer(1), Indexer(1), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, gbrr, Indexer(1), Indexer(2), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, gbrg, Indexer(1), 
Indexer(2), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, gbrb, Indexer(1), Indexer(2), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, gbra, Indexer(1), Indexer(2), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, gbgr, Indexer(1), Indexer(2), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, gbgg, Indexer(1), Indexer(2), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, gbgb, Indexer(1), Indexer(2), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, gbga, Indexer(1), Indexer(2), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, gbbr, Indexer(1), Indexer(2), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, gbbg, Indexer(1), Indexer(2), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, gbbb, Indexer(1), Indexer(2), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, gbba, Indexer(1), Indexer(2), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, gbar, Indexer(1), Indexer(2), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, gbag, Indexer(1), Indexer(2), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, gbab, Indexer(1), Indexer(2), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, gbaa, Indexer(1), Indexer(2), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, garr, Indexer(1), Indexer(3), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, garg, Indexer(1), Indexer(3), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, garb, Indexer(1), Indexer(3), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, gara, Indexer(1), Indexer(3), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, gagr, Indexer(1), Indexer(3), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, gagg, Indexer(1), Indexer(3), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, gagb, Indexer(1), Indexer(3), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, gaga, Indexer(1), Indexer(3), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, gabr, Indexer(1), Indexer(3), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, gabg, Indexer(1), Indexer(3), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, gabb, Indexer(1), Indexer(3), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, gaba, 
Indexer(1), Indexer(3), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, gaar, Indexer(1), Indexer(3), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, gaag, Indexer(1), Indexer(3), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, gaab, Indexer(1), Indexer(3), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, gaaa, Indexer(1), Indexer(3), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, brrr, Indexer(2), Indexer(0), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, brrg, Indexer(2), Indexer(0), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, brrb, Indexer(2), Indexer(0), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, brra, Indexer(2), Indexer(0), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, brgr, Indexer(2), Indexer(0), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, brgg, Indexer(2), Indexer(0), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, brgb, Indexer(2), Indexer(0), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, brga, Indexer(2), Indexer(0), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, brbr, Indexer(2), Indexer(0), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, brbg, Indexer(2), Indexer(0), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, brbb, Indexer(2), Indexer(0), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, brba, Indexer(2), Indexer(0), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, brar, Indexer(2), Indexer(0), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, brag, Indexer(2), Indexer(0), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, brab, Indexer(2), Indexer(0), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, braa, Indexer(2), Indexer(0), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, bgrr, Indexer(2), Indexer(1), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, bgrg, Indexer(2), Indexer(1), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, bgrb, Indexer(2), Indexer(1), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, bgra, Indexer(2), Indexer(1), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, bggr, Indexer(2), Indexer(1), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, 
bggg, Indexer(2), Indexer(1), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, bggb, Indexer(2), Indexer(1), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, bgga, Indexer(2), Indexer(1), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, bgbr, Indexer(2), Indexer(1), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, bgbg, Indexer(2), Indexer(1), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, bgbb, Indexer(2), Indexer(1), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, bgba, Indexer(2), Indexer(1), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, bgar, Indexer(2), Indexer(1), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, bgag, Indexer(2), Indexer(1), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, bgab, Indexer(2), Indexer(1), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, bgaa, Indexer(2), Indexer(1), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, bbrr, Indexer(2), Indexer(2), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, bbrg, Indexer(2), Indexer(2), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, bbrb, Indexer(2), Indexer(2), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, bbra, Indexer(2), Indexer(2), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, bbgr, Indexer(2), Indexer(2), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, bbgg, Indexer(2), Indexer(2), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, bbgb, Indexer(2), Indexer(2), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, bbga, Indexer(2), Indexer(2), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, bbbr, Indexer(2), Indexer(2), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, bbbg, Indexer(2), Indexer(2), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, bbbb, Indexer(2), Indexer(2), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, bbba, Indexer(2), Indexer(2), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, bbar, Indexer(2), Indexer(2), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, bbag, Indexer(2), Indexer(2), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, bbab, Indexer(2), Indexer(2), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 
4, bbaa, Indexer(2), Indexer(2), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, barr, Indexer(2), Indexer(3), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, barg, Indexer(2), Indexer(3), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, barb, Indexer(2), Indexer(3), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, bara, Indexer(2), Indexer(3), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, bagr, Indexer(2), Indexer(3), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, bagg, Indexer(2), Indexer(3), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, bagb, Indexer(2), Indexer(3), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, baga, Indexer(2), Indexer(3), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, babr, Indexer(2), Indexer(3), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, babg, Indexer(2), Indexer(3), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, babb, Indexer(2), Indexer(3), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, baba, Indexer(2), Indexer(3), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, baar, Indexer(2), Indexer(3), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, baag, Indexer(2), Indexer(3), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, baab, Indexer(2), Indexer(3), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, baaa, Indexer(2), Indexer(3), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, arrr, Indexer(3), Indexer(0), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, arrg, Indexer(3), Indexer(0), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, arrb, Indexer(3), Indexer(0), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, arra, Indexer(3), Indexer(0), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, argr, Indexer(3), Indexer(0), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, argg, Indexer(3), Indexer(0), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, argb, Indexer(3), Indexer(0), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, arga, Indexer(3), Indexer(0), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, arbr, Indexer(3), Indexer(0), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N 
== 4, arbg, Indexer(3), Indexer(0), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, arbb, Indexer(3), Indexer(0), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, arba, Indexer(3), Indexer(0), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, arar, Indexer(3), Indexer(0), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, arag, Indexer(3), Indexer(0), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, arab, Indexer(3), Indexer(0), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, araa, Indexer(3), Indexer(0), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, agrr, Indexer(3), Indexer(1), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, agrg, Indexer(3), Indexer(1), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, agrb, Indexer(3), Indexer(1), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, agra, Indexer(3), Indexer(1), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, aggr, Indexer(3), Indexer(1), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, aggg, Indexer(3), Indexer(1), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, aggb, Indexer(3), Indexer(1), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, agga, Indexer(3), Indexer(1), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, agbr, Indexer(3), Indexer(1), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, agbg, Indexer(3), Indexer(1), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, agbb, Indexer(3), Indexer(1), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, agba, Indexer(3), Indexer(1), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, agar, Indexer(3), Indexer(1), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, agag, Indexer(3), Indexer(1), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, agab, Indexer(3), Indexer(1), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, agaa, Indexer(3), Indexer(1), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, abrr, Indexer(3), Indexer(2), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, abrg, Indexer(3), Indexer(2), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, abrb, Indexer(3), Indexer(2), Indexer(0), Indexer(2)) 
+__SYCL_ACCESS(N == 4, abra, Indexer(3), Indexer(2), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, abgr, Indexer(3), Indexer(2), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, abgg, Indexer(3), Indexer(2), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, abgb, Indexer(3), Indexer(2), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, abga, Indexer(3), Indexer(2), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, abbr, Indexer(3), Indexer(2), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, abbg, Indexer(3), Indexer(2), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, abbb, Indexer(3), Indexer(2), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, abba, Indexer(3), Indexer(2), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, abar, Indexer(3), Indexer(2), Indexer(3), Indexer(0)) +__SYCL_ACCESS(N == 4, abag, Indexer(3), Indexer(2), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, abab, Indexer(3), Indexer(2), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, abaa, Indexer(3), Indexer(2), Indexer(3), Indexer(3)) +__SYCL_ACCESS(N == 4, aarr, Indexer(3), Indexer(3), Indexer(0), Indexer(0)) +__SYCL_ACCESS(N == 4, aarg, Indexer(3), Indexer(3), Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, aarb, Indexer(3), Indexer(3), Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, aara, Indexer(3), Indexer(3), Indexer(0), Indexer(3)) +__SYCL_ACCESS(N == 4, aagr, Indexer(3), Indexer(3), Indexer(1), Indexer(0)) +__SYCL_ACCESS(N == 4, aagg, Indexer(3), Indexer(3), Indexer(1), Indexer(1)) +__SYCL_ACCESS(N == 4, aagb, Indexer(3), Indexer(3), Indexer(1), Indexer(2)) +__SYCL_ACCESS(N == 4, aaga, Indexer(3), Indexer(3), Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, aabr, Indexer(3), Indexer(3), Indexer(2), Indexer(0)) +__SYCL_ACCESS(N == 4, aabg, Indexer(3), Indexer(3), Indexer(2), Indexer(1)) +__SYCL_ACCESS(N == 4, aabb, Indexer(3), Indexer(3), Indexer(2), Indexer(2)) +__SYCL_ACCESS(N == 4, aaba, Indexer(3), Indexer(3), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, aaar, Indexer(3), Indexer(3), Indexer(3), 
Indexer(0)) +__SYCL_ACCESS(N == 4, aaag, Indexer(3), Indexer(3), Indexer(3), Indexer(1)) +__SYCL_ACCESS(N == 4, aaab, Indexer(3), Indexer(3), Indexer(3), Indexer(2)) +__SYCL_ACCESS(N == 4, aaaa, Indexer(3), Indexer(3), Indexer(3), Indexer(3)) + +#endif // #ifdef SYCL_SIMPLE_SWIZZLES + +//__swizzled_vec__ lo()/hi() const; +__SYCL_ACCESS(N == 2, lo, Indexer(0)) +__SYCL_ACCESS(N == 3, lo, Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 4, lo, Indexer(0), Indexer(1)) +__SYCL_ACCESS(N == 8, lo, Indexer(0), Indexer(1), Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 16, lo, Indexer(0), Indexer(1), Indexer(2), Indexer(3), + Indexer(4), Indexer(5), Indexer(6), Indexer(7)) +__SYCL_ACCESS(N == 2, hi, Indexer(1)) +__SYCL_ACCESS(N == 3, hi, Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 4, hi, Indexer(2), Indexer(3)) +__SYCL_ACCESS(N == 8, hi, Indexer(4), Indexer(5), Indexer(6), Indexer(7)) +__SYCL_ACCESS(N == 16, hi, Indexer(8), Indexer(9), Indexer(10), Indexer(11), + Indexer(12), Indexer(13), Indexer(14), Indexer(15)) +//__swizzled_vec__ odd()/even() const; +__SYCL_ACCESS(N == 2, odd, Indexer(1)) +__SYCL_ACCESS(N == 3, odd, Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 4, odd, Indexer(1), Indexer(3)) +__SYCL_ACCESS(N == 8, odd, Indexer(1), Indexer(3), Indexer(5), Indexer(7)) +__SYCL_ACCESS(N == 16, odd, Indexer(1), Indexer(3), Indexer(5), Indexer(7), + Indexer(9), Indexer(11), Indexer(13), Indexer(15)) +__SYCL_ACCESS(N == 2, even, Indexer(0)) +__SYCL_ACCESS(N == 3, even, Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 4, even, Indexer(0), Indexer(2)) +__SYCL_ACCESS(N == 8, even, Indexer(0), Indexer(2), Indexer(4), Indexer(6)) +__SYCL_ACCESS(N == 16, even, Indexer(0), Indexer(2), Indexer(4), Indexer(6), + Indexer(8), Indexer(10), Indexer(12), Indexer(14)) +#undef __SYCL_ACCESS diff --git a/sycl/include/CL/sycl/types.hpp b/sycl/include/CL/sycl/types.hpp new file mode 100644 index 000000000000..608f9537f578 --- /dev/null +++ b/sycl/include/CL/sycl/types.hpp @@ -0,0 +1,1546 @@ 
+//==---------------- types.hpp --- SYCL types ------------------------------==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#ifndef __SYCL_DEVICE_ONLY__ +#include +#include +#endif // __SYCL_DEVICE_ONLY__ +// 4.10.1: Scalar data types +// 4.10.2: SYCL vector types + +namespace cl { +namespace sycl { + +enum class rounding_mode { automatic, rte, rtz, rtp, rtn }; +struct elem { + static constexpr int x = 0; + static constexpr int y = 1; + static constexpr int z = 2; + static constexpr int w = 3; + static constexpr int r = 0; + static constexpr int g = 1; + static constexpr int b = 2; + static constexpr int a = 3; + static constexpr int s0 = 0; + static constexpr int s1 = 1; + static constexpr int s2 = 2; + static constexpr int s3 = 3; + static constexpr int s4 = 4; + static constexpr int s5 = 5; + static constexpr int s6 = 6; + static constexpr int s7 = 7; + static constexpr int s8 = 8; + static constexpr int s9 = 9; + static constexpr int sA = 10; + static constexpr int sB = 11; + static constexpr int sC = 12; + static constexpr int sD = 13; + static constexpr int sE = 14; + static constexpr int sF = 15; +}; + +/** + * A signed 8-bit integer. + */ +typedef signed char schar; + +/** + * An unsigned 8-bit integer. + */ +typedef unsigned char uchar; + +/** + * An unsigned 16-bit integer. + */ +typedef unsigned short ushort; + +/** + * An unsigned 32-bit integer. + */ +typedef unsigned int uint; + +/** + * An unsigned 64-bit integer. + */ +typedef unsigned long ulong; + +/** + * An signed integer with width of at least 64-bit. + */ +typedef long long longlong; + +/** + * An unsigned integer with width of at least 64-bit. + */ +typedef unsigned long long ulonglong; + +namespace detail { + +template class OperationCurrentT, int... 
Indexes> +class SwizzleOp; + +template class BaseCLTypeConverter; + +// Element type for relational operator return value. +template +using rel_t = typename std::conditional< + sizeof(DataT) == sizeof(cl_char), cl_char, + typename std::conditional< + sizeof(DataT) == sizeof(cl_short), cl_short, + typename std::conditional< + sizeof(DataT) == sizeof(cl_int), cl_int, + typename std::conditional::type>::type>::type>::type; + +// Special type indicating that SwizzleOp should just read value from vector - +// not trying to perform any operations. Should not be called. +template class GetOp { +public: + DataT getValue(size_t Index) const; + DataT operator()(DataT LHS, DataT Rhs); +}; + +// Special type for working SwizzleOp with scalars, stores a scalar and gives +// the scalar at any index. Provides interface is compatible with SwizzleOp +// operations +template class GetScalarOp { +public: + GetScalarOp(DataT Data) : m_Data(Data) {} + DataT getValue(size_t Index) const { return m_Data; } + +private: + DataT m_Data; +}; + +template struct EqualTo { + constexpr rel_t operator()(const T &Lhs, const T &Rhs) const { + return (Lhs == Rhs) ? -1 : 0; + } +}; + +template struct NotEqualTo { + constexpr rel_t operator()(const T &Lhs, const T &Rhs) const { + return (Lhs != Rhs) ? -1 : 0; + } +}; + +template struct GreaterEqualTo { + constexpr rel_t operator()(const T &Lhs, const T &Rhs) const { + return (Lhs >= Rhs) ? -1 : 0; + } +}; + +template struct LessEqualTo { + constexpr rel_t operator()(const T &Lhs, const T &Rhs) const { + return (Lhs <= Rhs) ? -1 : 0; + } +}; + +template struct GreaterThan { + constexpr rel_t operator()(const T &Lhs, const T &Rhs) const { + return (Lhs > Rhs) ? -1 : 0; + } +}; + +template struct LessThan { + constexpr rel_t operator()(const T &Lhs, const T &Rhs) const { + return (Lhs < Rhs) ? -1 : 0; + } +}; + +template struct LogicalAnd { + constexpr rel_t operator()(const T &Lhs, const T &Rhs) const { + return (Lhs && Rhs) ? 
-1 : 0; + } +}; + +template struct LogicalOr { + constexpr rel_t operator()(const T &Lhs, const T &Rhs) const { + return (Lhs || Rhs) ? -1 : 0; + } +}; + +template struct RShift { + constexpr T operator()(const T &Lhs, const T &Rhs) const { + return Lhs >> Rhs; + } +}; + +template struct LShift { + constexpr T operator()(const T &Lhs, const T &Rhs) const { + return Lhs << Rhs; + } +}; + +} // namespace detail + +template class vec { + // This represent type of underlying value. There should be only one field + // in the class, so vec should be equal to float16 in memory. + using DataType = + typename detail::BaseCLTypeConverter::DataType; + + template + using conditional_t = typename std::conditional::type; + + static constexpr int getNumElements() { return NumElements; } + + // SizeChecker is needed for vec(const argTN &... args) ctor to validate args. + template + struct SizeChecker + : conditional_t {}; + + template + struct SizeChecker + : conditional_t, + std::false_type> {}; + +#define ALLOW_VECTOR_SIZES(num_elements) \ + template \ + struct SizeChecker, tail...> \ + : conditional_t, \ + std::false_type> {}; \ + template class T4, int... T5, \ + class... tail> \ + struct SizeChecker< \ + Counter, MaxValue, \ + detail::SwizzleOp, T2, T3, T4, T5...>, \ + tail...> \ + : conditional_t, \ + std::false_type> {}; \ + template class T4, int... T5, \ + class... tail> \ + struct SizeChecker< \ + Counter, MaxValue, \ + detail::SwizzleOp, T2, T3, T4, T5...>, \ + tail...> \ + : conditional_t, \ + std::false_type> {}; + + ALLOW_VECTOR_SIZES(1) + ALLOW_VECTOR_SIZES(2) + ALLOW_VECTOR_SIZES(3) + ALLOW_VECTOR_SIZES(4) + ALLOW_VECTOR_SIZES(8) + ALLOW_VECTOR_SIZES(16) +#undef ALLOW_VECTOR_SIZES + + template struct conjunction : std::true_type {}; + template + struct conjunction + : conditional_t, B1> {}; + + // TypeChecker is needed for vec(const argTN &... args) ctor to validate args. 
+ template + struct TypeChecker : std::is_convertible {}; +#define ALLOW_VECTOR_TYPES(num_elements) \ + template \ + struct TypeChecker, DataT_> : std::true_type {}; \ + template class T4, int... T5> \ + struct TypeChecker< \ + detail::SwizzleOp, T2, T3, T4, T5...>, DataT_> \ + : std::true_type {}; \ + template class T4, int... T5> \ + struct TypeChecker< \ + detail::SwizzleOp, T2, T3, T4, T5...>, \ + DataT_> : std::true_type {}; + + ALLOW_VECTOR_TYPES(1) + ALLOW_VECTOR_TYPES(2) + ALLOW_VECTOR_TYPES(3) + ALLOW_VECTOR_TYPES(4) + ALLOW_VECTOR_TYPES(8) + ALLOW_VECTOR_TYPES(16) +#undef ALLOW_VECTOR_TYPES + + template + using Swizzle = + detail::SwizzleOp, detail::GetOp, + detail::GetOp, Indexes...>; + + template + using ConstSwizzle = + detail::SwizzleOp, detail::GetOp, + detail::GetOp, Indexes...>; + + // Shortcuts for args validation in vec(const argTN &... args) ctor. + template + using EnableIfSuitableTypes = typename std::enable_if< + conjunction...>::value>::type; + + template + using EnableIfSuitableNumElements = typename std::enable_if< + SizeChecker<0, NumElements, argTN...>::value>::type; + +public: + using element_type = DataT; + using rel_t = detail::rel_t; + +#ifdef __SYCL_DEVICE_ONLY__ + using vector_t = DataType; +#endif + + vec() { m_Data = {0}; } + + vec(const vec &Rhs) : m_Data(Rhs.m_Data) {} + + vec(vec &&Rhs) : m_Data(std::move(Rhs.m_Data)) {} + + vec &operator=(const vec &Rhs) { + m_Data = Rhs.m_Data; + return *this; + } + + // W/o this, things like "vec = vec" doesn't work. + template + typename std::enable_if::value && + std::is_convertible::value, + vec &>::type + operator=(const vec &Rhs) { + *this = Rhs.template as(); + return *this; + } + + explicit vec(const DataT &arg) { + for (int i = 0; i < NumElements; ++i) { + setValue(i, arg); + } + } + + // Constructor from values of base type or vec of base type. Checks that + // base types are match and that the NumElements == sum of lenghts of args. 
+ template , + typename = EnableIfSuitableNumElements> + vec(const argTN &... args) { + vaargCtorHelper(0, args...); + } + + // TODO: Remove, for debug purposes only. + void dump() { +#ifndef __SYCL_DEVICE_ONLY__ + for (int I = 0; I < NumElements; ++I) { + std::cout << " " << I << ": " << m_Data.s[I] << std::endl; + } + std::cout << std::endl; +#endif // __SYCL_DEVICE_ONLY__ + } + +#ifdef __SYCL_DEVICE_ONLY__ + + template ::value && + !std::is_same::value>::type> + vec(vector_t openclVector) : m_Data(openclVector) {} + operator vector_t() const { return m_Data; } +#endif + // Available only when: NumElements == 1 + template + operator typename std::enable_if::type() const { + return m_Data; + } + size_t get_count() const { return NumElements; } + size_t get_size() const { return sizeof(m_Data); } + + // TODO: convert() for FP types. Also, check whether rounding mode handling + // is needed for integers to FP convert. + // template + // vec convert() const; + template + typename std::enable_if::value, + vec>::type + convert() const { + vec Result; + for (size_t I = 0; I < NumElements; ++I) { + Result.setValue(I, static_cast(getValue(I))); + } + return Result; + } + + template + typename std::enable_if::type + as() const { + asT Result; + *static_cast(static_cast(&Result.m_Data)) = m_Data; + return Result; + } + + template Swizzle swizzle() { + return this; + } + + template + ConstSwizzle swizzle() const { + return this; + } + + // Begin hi/lo, even/odd, xyzw, and rgba swizzles. +private: + // Indexer used in the swizzles.def + static constexpr int Indexer(int index) { return index; } + +public: +#ifdef __SYCL_ACCESS_RETURN +#error "Undefine __SYCL_ACCESS_RETURN macro" +#endif +#define __SYCL_ACCESS_RETURN this +#include "swizzles.def" +#undef __SYCL_ACCESS_RETURN + // End of hi/lo, even/odd, xyzw, and rgba swizzles. + + // TODO: make templated address space to work. 
+ // Somehow, access<> to multi_ptr<> conversion doesn't work w/o making + // address space explicitly specified. +#ifdef __SYCL_LOADSTORE +#error "Undefine __SYCL_LOADSTORE macro" +#endif +#define __SYCL_LOADSTORE(Space) \ + void load(size_t Offset, multi_ptr Ptr) { \ + m_Data = *multi_ptr(static_cast( \ + static_cast(Ptr + Offset * NumElements))); \ + } \ + void store(size_t Offset, multi_ptr Ptr) const { \ + *multi_ptr(static_cast( \ + static_cast(Ptr + Offset * NumElements))) = m_Data; \ + } + + __SYCL_LOADSTORE(access::address_space::global_space) + __SYCL_LOADSTORE(access::address_space::local_space) + __SYCL_LOADSTORE(access::address_space::constant_space) + __SYCL_LOADSTORE(access::address_space::private_space) +#undef __SYCL_LOADSTORE + +#ifdef __SYCL_BINOP +#error "Undefine __SYCL_BINOP macro" +#endif + +#ifdef __SYCL_DEVICE_ONLY__ +#define __SYCL_BINOP(BINOP, OPASSIGN) \ + vec operator BINOP(const vec &Rhs) const { \ + vec Ret; \ + Ret.m_Data = m_Data BINOP Rhs.m_Data; \ + return Ret; \ + } \ + template \ + typename std::enable_if::value && \ + std::is_fundamental::value, \ + vec>::type \ + operator BINOP(const T &Rhs) const { \ + return *this BINOP vec(static_cast(Rhs)); \ + } \ + vec &operator OPASSIGN(const vec &Rhs) { \ + *this = *this BINOP Rhs; \ + return *this; \ + } \ + template \ + typename std::enable_if::type operator OPASSIGN( \ + const DataT &Rhs) { \ + *this = *this BINOP vec(Rhs); \ + return *this; \ + } +#else // __SYCL_DEVICE_ONLY__ +#define __SYCL_BINOP(BINOP, OPASSIGN) \ + vec operator BINOP(const vec &Rhs) const { \ + vec Ret; \ + for (size_t I = 0; I < NumElements; ++I) { \ + Ret.setValue(I, (getValue(I) BINOP Rhs.getValue(I))); \ + } \ + return Ret; \ + } \ + template \ + typename std::enable_if::value && \ + std::is_fundamental::value, \ + vec>::type \ + operator BINOP(const T &Rhs) const { \ + return *this BINOP vec(static_cast(Rhs)); \ + } \ + vec &operator OPASSIGN(const vec &Rhs) { \ + *this = *this BINOP Rhs; \ + return *this; 
\ + } \ + template \ + typename std::enable_if::type operator OPASSIGN( \ + const DataT &Rhs) { \ + *this = *this BINOP vec(Rhs); \ + return *this; \ + } +#endif // __SYCL_DEVICE_ONLY__ + + __SYCL_BINOP(+, +=) + __SYCL_BINOP(-, -=) + __SYCL_BINOP(*, *=) + __SYCL_BINOP(/, /=) + + // TODO: The following OPs are available only when: DataT != cl_float && + // DataT != cl_double && DataT != cl_half + __SYCL_BINOP(%, %=) + __SYCL_BINOP(|, |=) + __SYCL_BINOP(&, &=) + __SYCL_BINOP(^, ^=) + __SYCL_BINOP(>>, >>=) + __SYCL_BINOP(<<, <<=) +#undef __SYCL_BINOP +#undef __SYCL_BINOP_HELP + + // Note: vec<>/SwizzleOp logical value is 0/-1 logic, as opposed to 0/1 logic. + // As far as CTS validation is concerned, 0/-1 logic also applies when + // NumElements is equal to one, which is somewhat inconsistent with being + // tranparent with scalar data. + // + // TODO, at least for the device: Use direct comparison on aggregate data, + // e.g., Ret.m_Data = m_Data RELLOGOP Rhs.m_Data, as opposed to looping + // around scalar operations. +#ifdef __SYCL_RELLOGOP +#error "Undefine __SYCL_RELLOGOP macro" +#endif +#define __SYCL_RELLOGOP(RELLOGOP) \ + vec operator RELLOGOP(const vec &Rhs) const { \ + vec Ret; \ + for (size_t I = 0; I < NumElements; ++I) { \ + Ret.setValue(I, -(getValue(I) RELLOGOP Rhs.getValue(I))); \ + } \ + return Ret; \ + } \ + template \ + typename std::enable_if::value && \ + std::is_fundamental::value, \ + vec>::type \ + operator RELLOGOP(const T &Rhs) const { \ + return *this RELLOGOP vec(static_cast(Rhs)); \ + } + + __SYCL_RELLOGOP(==) + __SYCL_RELLOGOP(!=) + __SYCL_RELLOGOP(>) + __SYCL_RELLOGOP(<) + __SYCL_RELLOGOP(>=) + __SYCL_RELLOGOP(<=) + // TODO: limit to integral types. 
+ __SYCL_RELLOGOP(&&) + __SYCL_RELLOGOP(||) +#undef __SYCL_RELLOGOP + +#ifdef __SYCL_UOP +#error "Undefine __SYCL_UOP macro" +#endif +#define __SYCL_UOP(UOP, OPASSIGN) \ + vec &operator UOP() { \ + *this OPASSIGN 1; \ + return *this; \ + } \ + vec operator UOP(int) { \ + vec Ret(*this); \ + *this OPASSIGN 1; \ + return Ret; \ + } + + __SYCL_UOP(++, +=) + __SYCL_UOP(--, -=) +#undef __SYCL_UOP + + template + typename std::enable_if::value, vec>::type + operator~() const { + vec Ret; + for (size_t I = 0; I < NumElements; ++I) { + Ret.setValue(I, ~getValue(I)); + } + return Ret; + } + + vec operator!() const { + vec Ret; + for (size_t I = 0; I < NumElements; ++I) { + Ret.setValue(I, !getValue(I)); + } + return Ret; + } + + // OP is: &&, || + // vec operatorOP(const vec &Rhs) const; + // vec operatorOP(const DataT &Rhs) const; + + // OP is: ==, !=, <, >, <=, >= + // vec operatorOP(const vec &Rhs) const; + // vec operatorOP(const DataT &Rhs) const; +private: + // Generic method that execute "Operation" on underlying values. + template