diff --git a/CMakeLists.txt b/CMakeLists.txt
index cbadde91e4..0264f1a7cc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -104,7 +104,9 @@ if (LLAMA_BUILD_EXAMPLES)
     check_language(CUDA)
     if (CMAKE_CUDA_COMPILER)
         enable_language(CUDA)
+        set(CMAKE_CUDA_ARCHITECTURES "35" CACHE STRING "CUDA architectures to compile for")
         add_subdirectory("examples/cuda/nbody")
+        add_subdirectory("examples/cuda/pitch")
     elseif()
         message(WARNING "Could not find CUDA. Try setting CMAKE_CUDA_COMPILER. CUDA examples are disabled.")
     endif()
diff --git a/examples/cuda/nbody/CMakeLists.txt b/examples/cuda/nbody/CMakeLists.txt
index 55e39d96a3..c340740554 100644
--- a/examples/cuda/nbody/CMakeLists.txt
+++ b/examples/cuda/nbody/CMakeLists.txt
@@ -1,6 +1,5 @@
 cmake_minimum_required (VERSION 3.18.3)
 project(llama-cuda-nbody CXX CUDA)
-set(CMAKE_CUDA_ARCHITECTURES "35" CACHE STRING "CUDA architectures to compile for")
 
 find_package(CUDAToolkit) # for include directories
 find_package(fmt CONFIG REQUIRED)
diff --git a/examples/cuda/pitch/CMakeLists.txt b/examples/cuda/pitch/CMakeLists.txt
new file mode 100644
index 0000000000..bc507340a1
--- /dev/null
+++ b/examples/cuda/pitch/CMakeLists.txt
@@ -0,0 +1,13 @@
+cmake_minimum_required (VERSION 3.18.3)
+project(llama-cuda-pitch CXX CUDA)
+
+find_package(CUDAToolkit) # for include directories
+find_package(fmt CONFIG REQUIRED)
+if (NOT TARGET llama::llama)
+    find_package(llama REQUIRED)
+endif()
+add_executable(${PROJECT_NAME} pitch.cu)
+target_compile_features(${PROJECT_NAME} PRIVATE cuda_std_17)
+target_compile_options(${PROJECT_NAME} PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-extended-lambda --expt-relaxed-constexpr>)
+target_include_directories(${PROJECT_NAME} SYSTEM PRIVATE ../../../thirdparty/stb/include)
+target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama CUDA::cudart fmt::fmt)
\ No newline at end of file
diff --git a/examples/cuda/pitch/pitch.cu b/examples/cuda/pitch/pitch.cu
new file mode 100644
index 0000000000..3712fdb2e3
--- /dev/null
+++ b/examples/cuda/pitch/pitch.cu
@@ -0,0 +1,192 @@
+#define STB_IMAGE_WRITE_IMPLEMENTATION
+
+#include <cassert>
+#include <cuda_runtime.h>
+#include <fmt/core.h>
+#include <iostream>
+#include <llama/llama.hpp>
+#include <stb_image_write.h>
+#include <stdexcept>
+#include <vector>
+
+using namespace std::literals;
+
+struct RGB
+{
+    unsigned char r, g, b;
+
+    friend auto operator==(RGB a, RGB b) -> bool
+    {
+        return a.r == b.r && a.g == b.g && a.b == b.b;
+    }
+};
+
+void checkError(cudaError_t code)
+{
+    if(code != cudaSuccess)
+        throw std::runtime_error("CUDA Error: "s + cudaGetErrorString(code));
+}
+
+template<typename View, typename ArrayExtents>
+__global__ void init(View view, ArrayExtents extents)
+{
+    const auto x = blockIdx.x * blockDim.x + threadIdx.x;
+    const auto y = blockIdx.y * blockDim.y + threadIdx.y;
+    if(y >= extents[0] || x >= extents[1])
+        return;
+
+    view(y, x).r = x * 255 / static_cast<float>(blockDim.x * gridDim.x);
+    view(y, x).g = y * 255 / static_cast<float>(blockDim.y * gridDim.y);
+    view(y, x).b = (threadIdx.x + threadIdx.y) * 255 / static_cast<float>(blockDim.x + blockDim.y);
+}
+
+namespace llamaex
+{
+    using namespace llama;
+
+    template<typename RecordDim, typename ArrayExtents>
+    LLAMA_FN_HOST_ACC_INLINE constexpr auto pitchesFromExtents(ArrayExtents extents)
+    {
+        constexpr std::size_t dim = ArrayExtents{}.size();
+        Array<std::size_t, dim> pitches{};
+        pitches[dim - 1] = sizeOf<RecordDim>;
+        for(auto i = dim - 1; i > 0; --i)
+            pitches[i - 1] = pitches[i] * extents[i];
+        return pitches;
+    }
+
+    template<
+        typename TArrayExtents,
+        typename TRecordDim,
+        bool AlignAndPad = true,
+        template<typename> typename FlattenRecordDim = mapping::FlattenRecordDimInOrder>
+    struct PitchedAoS : mapping::MappingBase<TArrayExtents, TRecordDim>
+    {
+    private:
+        static constexpr std::size_t dim = TArrayExtents{}.size();
+
+        using Base = mapping::MappingBase<TArrayExtents, TRecordDim>;
+        using Flattener = FlattenRecordDim<TRecordDim>;
+
+        Array<std::size_t, dim> pitches;
+
+    public:
+        static constexpr std::size_t blobCount = 1;
+
+        LLAMA_FN_HOST_ACC_INLINE constexpr PitchedAoS(TArrayExtents extents, Array<std::size_t, dim> pitches)
+            : Base(extents)
+            , pitches(pitches)
+        {
+        }
+
+        LLAMA_FN_HOST_ACC_INLINE constexpr PitchedAoS(TArrayExtents extents, std::size_t rowPitch)
+            : Base(extents)
+            , pitches(pitchesFromExtents<TRecordDim>(extents))
+        {
+            static_assert(dim >= 2, "The rowPitch constructor is only available for 2D or higher dimensions");
+            pitches[dim - 2] = rowPitch;
+        }
+
+        LLAMA_FN_HOST_ACC_INLINE constexpr PitchedAoS(
+            TArrayExtents extents,
+            std::size_t rowPitch,
+            std::size_t slicePitch)
+            : Base(extents)
+            , pitches(pitchesFromExtents<TRecordDim>(extents))
+        {
+            static_assert(
+                dim >= 3,
+                "The rowPitch/slicePitch constructor is only available for 3D or higher dimensions");
+            pitches[dim - 2] = rowPitch;
+            pitches[dim - 3] = slicePitch;
+        }
+
+        LLAMA_FN_HOST_ACC_INLINE constexpr auto blobSize(std::size_t) const -> std::size_t
+        {
+            return pitches[0] * Base::extents()[0];
+        }
+
+        template<std::size_t... RecordCoords>
+        LLAMA_FN_HOST_ACC_INLINE constexpr auto blobNrAndOffset(
+            typename Base::ArrayIndex ai,
+            RecordCoord<RecordCoords...> = {}) const -> NrAndOffset
+        {
+            constexpr std::size_t flatFieldIndex =
+#ifdef __NVCC__
+                *& // mess with nvcc compiler state to workaround bug
+#endif
+                Flattener::template flatIndex<RecordCoords...>;
+            const auto offset
+                = dot(pitches, ai) + flatOffsetOf<typename Flattener::FlatRecordDim, flatFieldIndex, AlignAndPad>;
+            return {0, offset};
+        }
+    };
+} // namespace llamaex
+
+auto main() -> int
+try
+{
+    int device = 0;
+    checkError(cudaGetDevice(&device));
+    cudaDeviceProp prop{};
+    checkError(cudaGetDeviceProperties(&prop, device));
+    fmt::print(
+        "Running on {}, {}MiB GM, {}kiB SM\n",
+        prop.name,
+        prop.totalGlobalMem / 1024 / 1024,
+        prop.sharedMemPerBlock / 1024);
+
+    const auto extents = llama::ArrayExtents{600, 800}; // height, width
+    const auto widthBytes = extents[1] * sizeof(RGB);
+
+    const auto blockDim = dim3{16, 32, 1};
+    const auto gridDim = dim3{
+        llama::divCeil(static_cast<unsigned>(extents[1]), blockDim.x),
+        llama::divCeil(static_cast<unsigned>(extents[0]), blockDim.y),
+        1};
+
+    std::vector<RGB> host1(llama::product(extents));
+    std::vector<RGB> host2(llama::product(extents));
+    {
+        std::byte* mem = nullptr;
+        std::size_t rowPitch = 0;
+        checkError(cudaMallocPitch(reinterpret_cast<void**>(&mem), &rowPitch, widthBytes, extents[0]));
+        fmt::print("Row pitch: {} B ({} B padding)\n", rowPitch, rowPitch - widthBytes);
+
+        auto mapping = llamaex::PitchedAoS<std::decay_t<decltype(extents)>, RGB>{extents, rowPitch};
+        assert(mapping.blobSize(0) == rowPitch * extents[0]);
+        auto view = llama::View{mapping, llama::Array{mem}};
+
+        init<<<gridDim, blockDim>>>(view, extents);
+
+        checkError(cudaMemcpy2D(host1.data(), widthBytes, mem, rowPitch, widthBytes, extents[0], cudaMemcpyDefault));
+        checkError(cudaFree(mem));
+
+        stbi_write_png("pitch1.png", extents[1], extents[0], 3, host1.data(), 0);
+    }
+
+    // nvcc 11.3 fails to compile the AoS mapping here
+#if !(defined(__NVCC__) && __CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ == 3)
+    {
+        std::byte* mem = nullptr;
+        checkError(cudaMalloc(&mem, widthBytes * extents[0]));
+
+        auto mapping = llama::mapping::AoS{extents, RGB{}};
+        auto view = llama::View{mapping, llama::Array{mem}};
+
+        init<<<gridDim, blockDim>>>(view, extents);
+
+        checkError(cudaMemcpy(host2.data(), mem, widthBytes * extents[0], cudaMemcpyDefault));
+        checkError(cudaFree(mem));
+
+        stbi_write_png("pitch2.png", extents[1], extents[0], 3, host2.data(), 0);
+    }
+#endif
+
+    if(host1 != host2)
+        fmt::print("ERROR: produced two different images\n");
+
+    return 0;
+}
+catch(const std::exception& e)
+{
+    std::cerr << "Exception: " << e.what() << '\n';
+}
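
Reviewer note (not part of the patch): the sketch below mirrors, in plain C++ and under assumed values, the address arithmetic that PitchedAoS performs. pitchesFromExtents derives densely packed byte pitches from the extents and the record size, the rowPitch constructor then overwrites the row stride with whatever cudaMallocPitch reported, and blobNrAndOffset reduces to a dot product of pitches and array indices plus the offset of the accessed field. The 2560 B row pitch is a made-up stand-in; the real value is device-dependent.

    #include <cassert>
    #include <cstddef>

    struct RGB { unsigned char r, g, b; };

    int main()
    {
        // Extents {600, 800} (height, width), as in the example.
        constexpr std::size_t extents[2] = {600, 800};

        // Densely packed pitches, as pitchesFromExtents computes them:
        // pitches[1] = sizeof(RGB) = 3 B per pixel,
        // pitches[0] = pitches[1] * extents[1] = 2400 B per row.
        std::size_t pitches[2] = {sizeof(RGB) * extents[1], sizeof(RGB)};

        // The rowPitch constructor overwrites pitches[dim - 2]; assume
        // cudaMallocPitch padded each 2400 B row to 2560 B.
        pitches[0] = 2560;

        // blobNrAndOffset: dot(pitches, ai) + field offset. Byte offset of
        // the g component (field offset 1) of the pixel at (y = 1, x = 2):
        assert(pitches[0] * 1 + pitches[1] * 2 + offsetof(RGB, g) == 2560 + 6 + 1);

        // blobSize: a single blob of pitches[0] * extents[0] bytes.
        assert(pitches[0] * extents[0] == 2560 * 600);
    }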
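
A second note, on the launch configuration: llama::divCeil rounds the quotient up, so the grid covers the whole image even when the extents are not multiples of the block size; the surplus threads in the last row of blocks are the reason init() begins with a bounds check. A hypothetical stand-in, for illustration only (the real implementation lives in llama):

    constexpr auto divCeil(unsigned a, unsigned b) -> unsigned
    {
        return (a + b - 1) / b; // round up instead of truncating
    }

    static_assert(divCeil(800, 16) == 50); // gridDim.x: 50 * 16 = 800, exact fit
    static_assert(divCeil(600, 32) == 19); // gridDim.y: 19 * 32 = 608 > 600,
                                           // hence the early return in init()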