add new CUDA demo for pitched allocation
bernhardmgruber committed Mar 1, 2022
1 parent c4b94b6 commit 7443349
Showing 4 changed files with 207 additions and 1 deletion.
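
Context for the change: cudaMallocPitch may pad each row of a 2D allocation so that rows start at well-aligned addresses, and the pitch it returns (bytes per row), not width * sizeof(element), is the stride between consecutive rows. The new demo wraps exactly this in a LLAMA mapping. Below is a minimal stand-alone sketch of the underlying CUDA idea, without LLAMA; the sizes merely mirror the demo and error handling is omitted for brevity.

#include <cstddef>
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    const std::size_t width = 800, height = 600; // pixels, mirroring the demo below
    const std::size_t widthBytes = width * 3;    // 3 bytes per RGB pixel

    void* mem = nullptr;
    std::size_t rowPitch = 0; // filled in by the runtime, >= widthBytes
    cudaMallocPitch(&mem, &rowPitch, widthBytes, height); // error checking omitted
    std::printf("row pitch: %zu B (%zu B padding)\n", rowPitch, rowPitch - widthBytes);

    // A pixel (y, x) then lives at byte offset y * rowPitch + x * 3, not y * widthBytes + x * 3.
    cudaFree(mem);
    return 0;
}
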
2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -104,7 +104,9 @@ if (LLAMA_BUILD_EXAMPLES)
check_language(CUDA)
if (CMAKE_CUDA_COMPILER)
enable_language(CUDA)
set(CMAKE_CUDA_ARCHITECTURES "35" CACHE STRING "CUDA architectures to compile for")
add_subdirectory("examples/cuda/nbody")
add_subdirectory("examples/cuda/pitch")
else()
message(WARNING "Could not find CUDA. Try setting CMAKE_CUDA_COMPILER. CUDA examples are disabled.")
endif()
1 change: 0 additions & 1 deletion examples/cuda/nbody/CMakeLists.txt
@@ -1,6 +1,5 @@
cmake_minimum_required (VERSION 3.18.3)
project(llama-cuda-nbody CXX CUDA)
set(CMAKE_CUDA_ARCHITECTURES "35" CACHE STRING "CUDA architectures to compile for")

find_package(CUDAToolkit) # for include directories
find_package(fmt CONFIG REQUIRED)
13 changes: 13 additions & 0 deletions examples/cuda/pitch/CMakeLists.txt
@@ -0,0 +1,13 @@
cmake_minimum_required (VERSION 3.18.3)
project(llama-cuda-pitch CXX CUDA)

find_package(CUDAToolkit) # for include directories
find_package(fmt CONFIG REQUIRED)
if (NOT TARGET llama::llama)
    find_package(llama REQUIRED)
endif()
add_executable(${PROJECT_NAME} pitch.cu)
target_compile_features(${PROJECT_NAME} PRIVATE cuda_std_17)
target_compile_options(${PROJECT_NAME} PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-extended-lambda --expt-relaxed-constexpr>)
target_include_directories(${PROJECT_NAME} SYSTEM PRIVATE ../../../thirdparty/stb/include)
target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama CUDA::cudart fmt::fmt)
192 changes: 192 additions & 0 deletions examples/cuda/pitch/pitch.cu
@@ -0,0 +1,192 @@
#define STB_IMAGE_WRITE_IMPLEMENTATION

#include <cassert>
#include <cstddef>
#include <cuda_runtime.h>
#include <fmt/format.h>
#include <iostream>
#include <llama/llama.hpp>
#include <stb_image_write.h>
#include <stdexcept>
#include <vector>

using namespace std::literals;

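// 3-byte pixel type; also used as the record dimension of the LLAMA views below.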
struct RGB
{
    unsigned char r, g, b;

    friend auto operator==(RGB a, RGB b) -> bool
    {
        return a.r == b.r && a.g == b.g && a.b == b.b;
    }
};

void checkError(cudaError_t code)
{
    if(code != cudaSuccess)
        throw std::runtime_error("CUDA Error: "s + cudaGetErrorString(code));
}

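// Fills every pixel of the 2D view with a simple color gradient; oblivious to the memory layout behind the view.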
template<typename View, typename ArrayExtents>
__global__ void init(View view, ArrayExtents extents)
{
    const auto x = blockIdx.x * blockDim.x + threadIdx.x;
    const auto y = blockIdx.y * blockDim.y + threadIdx.y;
    if(y >= extents[0] || x >= extents[1])
        return;

    view(y, x).r = x * 255 / static_cast<float>(blockDim.x * gridDim.x);
    view(y, x).g = y * 255 / static_cast<float>(blockDim.y * gridDim.y);
    view(y, x).b = (threadIdx.x + threadIdx.y) * 255 / static_cast<float>(blockDim.x + blockDim.y);
}

namespace llamaex
{
    using namespace llama;

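    // Computes densely packed pitches (in bytes) for each array dimension of an AoS layout:
    // the innermost pitch is the record size, each outer pitch is the next inner pitch times the
    // corresponding extent (e.g. extents {600, 800} with a 3-byte RGB record give pitches {2400, 3}).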
    template<typename RecordDim, typename ArrayExtents>
    LLAMA_FN_HOST_ACC_INLINE constexpr auto pitchesFromExtents(ArrayExtents extents)
    {
        constexpr std::size_t dim = ArrayExtents{}.size();
        Array<std::size_t, dim> pitches{};
        pitches[dim - 1] = sizeOf<RecordDim>;
        for(auto i = dim - 1; i > 0; --i)
            pitches[i - 1] = pitches[i] * extents[i];
        return pitches;
    }

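    // AoS mapping over a single blob whose rows (and slices) may be padded: the per-dimension
    // pitches are supplied from outside, e.g. the row pitch returned by cudaMallocPitch, instead
    // of being derived from the extents alone.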
    template<
        typename TArrayExtents,
        typename TRecordDim,
        bool AlignAndPad = true,
        template<typename> typename FlattenRecordDim = mapping::FlattenRecordDimInOrder>
    struct PitchedAoS : mapping::MappingBase<TArrayExtents, TRecordDim>
    {
    private:
        static constexpr std::size_t dim = TArrayExtents{}.size();

        using Base = mapping::MappingBase<TArrayExtents, TRecordDim>;
        using Flattener = FlattenRecordDim<TRecordDim>;

        Array<std::size_t, dim> pitches;

    public:
        static constexpr std::size_t blobCount = 1;

        LLAMA_FN_HOST_ACC_INLINE constexpr PitchedAoS(TArrayExtents extents, Array<std::size_t, dim> pitches)
            : Base(extents)
            , pitches(pitches)
        {
        }

        LLAMA_FN_HOST_ACC_INLINE constexpr PitchedAoS(TArrayExtents extents, std::size_t rowPitch)
            : Base(extents)
            , pitches(pitchesFromExtents<TRecordDim>(extents))
        {
            static_assert(dim >= 2, "The rowPitch constructor is only available for 2D or higher dimensions");
            pitches[dim - 2] = rowPitch;
        }

        LLAMA_FN_HOST_ACC_INLINE constexpr PitchedAoS(
            TArrayExtents extents,
            std::size_t rowPitch,
            std::size_t slicePitch)
            : Base(extents)
            , pitches(pitchesFromExtents<TRecordDim>(extents))
        {
            static_assert(
                dim >= 3,
                "The rowPitch/slicePitch constructor is only available for 3D or higher dimensions");
            pitches[dim - 2] = rowPitch;
            pitches[dim - 3] = slicePitch;
        }

        LLAMA_FN_HOST_ACC_INLINE constexpr auto blobSize(std::size_t) const -> std::size_t
        {
            return pitches[0] * Base::extents()[0];
        }

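        // Maps an array index plus a record coordinate to a byte offset within the single blob,
        // using the stored pitches for the array dimensions.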
        template<std::size_t... RecordCoords>
        LLAMA_FN_HOST_ACC_INLINE constexpr auto blobNrAndOffset(
            typename Base::ArrayIndex ai,
            RecordCoord<RecordCoords...> = {}) const -> NrAndOffset
        {
            constexpr std::size_t flatFieldIndex =
#ifdef __NVCC__
                *& // mess with nvcc compiler state to work around a bug
#endif
                Flattener::template flatIndex<RecordCoords...>;
            const auto offset
                = dot(pitches, ai) + flatOffsetOf<typename Flattener::FlatRecordDim, flatFieldIndex, AlignAndPad>;
            return {0, offset};
        }
    };
} // namespace llamaex

auto main() -> int
try
{
    int device = 0;
    checkError(cudaGetDevice(&device));
    cudaDeviceProp prop{};
    checkError(cudaGetDeviceProperties(&prop, device));
    fmt::print(
        "Running on {}, {}MiB GM, {}kiB SM\n",
        prop.name,
        prop.totalGlobalMem / 1024 / 1024,
        prop.sharedMemPerBlock / 1024);

    const auto extents = llama::ArrayExtents{600, 800}; // height, width
    const auto widthBytes = extents[1] * sizeof(RGB);

    const auto blockDim = dim3{16, 32, 1};
    const auto gridDim = dim3{
        llama::divCeil(static_cast<unsigned>(extents[1]), blockDim.x),
        llama::divCeil(static_cast<unsigned>(extents[0]), blockDim.y),
        1};

    std::vector<RGB> host1(llama::product(extents));
    std::vector<RGB> host2(llama::product(extents));
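    // First image: allocate pitched device memory with cudaMallocPitch and view it through the
    // custom PitchedAoS mapping, so the kernel transparently skips the per-row padding.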
    {
        std::byte* mem = nullptr;
        std::size_t rowPitch = 0;
        checkError(cudaMallocPitch(&mem, &rowPitch, widthBytes, extents[0]));
        fmt::print("Row pitch: {} B ({} B padding)\n", rowPitch, rowPitch - widthBytes);

        auto mapping = llamaex::PitchedAoS<llama::ArrayExtentsDynamic<2>, RGB>{extents, rowPitch};
        assert(mapping.blobSize(0) == rowPitch * extents[0]);
        auto view = llama::View{mapping, llama::Array{mem}};

        init<<<gridDim, blockDim>>>(view, extents);

        checkError(cudaMemcpy2D(host1.data(), widthBytes, mem, rowPitch, widthBytes, extents[0], cudaMemcpyDefault));
        checkError(cudaFree(mem));

        stbi_write_png("pitch1.png", extents[1], extents[0], 3, host1.data(), 0);
    }

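    // Second image: plain cudaMalloc with the standard, densely packed AoS mapping, used as a
    // reference to check that both layouts produce identical pixels.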
    // nvcc 11.3 fails to compile the AoS mapping here
#if !(defined(__NVCC__) && __CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ == 3)
    {
        std::byte* mem = nullptr;
        checkError(cudaMalloc(&mem, widthBytes * extents[0]));

        auto mapping = llama::mapping::AoS{extents, RGB{}};
        auto view = llama::View{mapping, llama::Array{mem}};

        init<<<gridDim, blockDim>>>(view, extents);

        checkError(cudaMemcpy(host2.data(), mem, widthBytes * extents[0], cudaMemcpyDefault));
        checkError(cudaFree(mem));

        stbi_write_png("pitch2.png", extents[1], extents[0], 3, host2.data(), 0);
    }
#endif

    if(host1 != host2)
        fmt::print("ERROR: produced two different images\n");

    return 0;
}
catch(const std::exception& e)
{
    std::cerr << "Exception: " << e.what() << '\n';
}
