add new CUDA demo for pitched allocation
bernhardmgruber committed Mar 1, 2022
1 parent c4b94b6 commit 7443349
Showing 4 changed files with 207 additions and 1 deletion.
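
Context for the change: cudaMallocPitch may pad each row of a 2D allocation so that rows start at well-aligned addresses, and the pitch it returns (bytes per row), not width * sizeof(element), is the stride between consecutive rows. The new demo wraps exactly this in a LLAMA mapping. Below is a minimal stand-alone sketch of the underlying CUDA idea, without LLAMA; the sizes merely mirror the demo and error handling is omitted for brevity.

#include <cstddef>
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    const std::size_t width = 800, height = 600; // pixels, mirroring the demo below
    const std::size_t widthBytes = width * 3;    // 3 bytes per RGB pixel

    void* mem = nullptr;
    std::size_t rowPitch = 0; // filled in by the runtime, >= widthBytes
    cudaMallocPitch(&mem, &rowPitch, widthBytes, height); // error checking omitted
    std::printf("row pitch: %zu B (%zu B padding)\n", rowPitch, rowPitch - widthBytes);

    // A pixel (y, x) then lives at byte offset y * rowPitch + x * 3, not y * widthBytes + x * 3.
    cudaFree(mem);
    return 0;
}
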
2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -104,7 +104,9 @@ if (LLAMA_BUILD_EXAMPLES)
check_language(CUDA)
if (CMAKE_CUDA_COMPILER)
enable_language(CUDA)
set(CMAKE_CUDA_ARCHITECTURES "35" CACHE STRING "CUDA architectures to compile for")
add_subdirectory("examples/cuda/nbody")
add_subdirectory("examples/cuda/pitch")
else()
message(WARNING "Could not find CUDA. Try setting CMAKE_CUDA_COMPILER. CUDA examples are disabled.")
endif()
1 change: 0 additions & 1 deletion examples/cuda/nbody/CMakeLists.txt
@@ -1,6 +1,5 @@
cmake_minimum_required (VERSION 3.18.3)
project(llama-cuda-nbody CXX CUDA)
set(CMAKE_CUDA_ARCHITECTURES "35" CACHE STRING "CUDA architectures to compile for")

find_package(CUDAToolkit) # for include directories
find_package(fmt CONFIG REQUIRED)
13 changes: 13 additions & 0 deletions examples/cuda/pitch/CMakeLists.txt
@@ -0,0 +1,13 @@
cmake_minimum_required (VERSION 3.18.3)
project(llama-cuda-pitch CXX CUDA)

find_package(CUDAToolkit) # for include directories
find_package(fmt CONFIG REQUIRED)
if (NOT TARGET llama::llama)
    find_package(llama REQUIRED)
endif()
add_executable(${PROJECT_NAME} pitch.cu)
target_compile_features(${PROJECT_NAME} PRIVATE cuda_std_17)
target_compile_options(${PROJECT_NAME} PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-extended-lambda --expt-relaxed-constexpr>)
target_include_directories(${PROJECT_NAME} SYSTEM PRIVATE ../../../thirdparty/stb/include)
target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama CUDA::cudart fmt::fmt)
192 changes: 192 additions & 0 deletions examples/cuda/pitch/pitch.cu
@@ -0,0 +1,192 @@
#define STB_IMAGE_WRITE_IMPLEMENTATION

#include <cassert>
#include <cstddef>
#include <cuda_runtime.h>
#include <fmt/format.h>
#include <iostream>
#include <llama/llama.hpp>
#include <stb_image_write.h>
#include <stdexcept>
#include <vector>

using namespace std::literals;

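// 3-byte pixel type; also used as the record dimension of the LLAMA views below.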
struct RGB
{
    unsigned char r, g, b;

    friend auto operator==(RGB a, RGB b) -> bool
    {
        return a.r == b.r && a.g == b.g && a.b == b.b;
    }
};

void checkError(cudaError_t code)
{
    if(code != cudaSuccess)
        throw std::runtime_error("CUDA Error: "s + cudaGetErrorString(code));
}

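// Fills every pixel of the 2D view with a simple color gradient; oblivious to the memory layout behind the view.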
template<typename View, typename ArrayExtents>
__global__ void init(View view, ArrayExtents extents)
{
    const auto x = blockIdx.x * blockDim.x + threadIdx.x;
    const auto y = blockIdx.y * blockDim.y + threadIdx.y;
    if(y >= extents[0] || x >= extents[1])
        return;

    view(y, x).r = x * 255 / static_cast<float>(blockDim.x * gridDim.x);
    view(y, x).g = y * 255 / static_cast<float>(blockDim.y * gridDim.y);
    view(y, x).b = (threadIdx.x + threadIdx.y) * 255 / static_cast<float>(blockDim.x + blockDim.y);
}

namespace llamaex
{
    using namespace llama;

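    // Computes densely packed pitches (in bytes) for each array dimension of an AoS layout:
    // the innermost pitch is the record size, each outer pitch is the next inner pitch times the
    // corresponding extent (e.g. extents {600, 800} with a 3-byte RGB record give pitches {2400, 3}).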
    template<typename RecordDim, typename ArrayExtents>
    LLAMA_FN_HOST_ACC_INLINE constexpr auto pitchesFromExtents(ArrayExtents extents)
    {
        constexpr std::size_t dim = ArrayExtents{}.size();
        Array<std::size_t, dim> pitches{};
        pitches[dim - 1] = sizeOf<RecordDim>;
        for(auto i = dim - 1; i > 0; --i)
            pitches[i - 1] = pitches[i] * extents[i];
        return pitches;
    }

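    // AoS mapping over a single blob whose rows (and slices) may be padded: the per-dimension
    // pitches are supplied from outside, e.g. the row pitch returned by cudaMallocPitch, instead
    // of being derived from the extents alone.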
    template<
        typename TArrayExtents,
        typename TRecordDim,
        bool AlignAndPad = true,
        template<typename> typename FlattenRecordDim = mapping::FlattenRecordDimInOrder>
    struct PitchedAoS : mapping::MappingBase<TArrayExtents, TRecordDim>
    {
    private:
        static constexpr std::size_t dim = TArrayExtents{}.size();

        using Base = mapping::MappingBase<TArrayExtents, TRecordDim>;
        using Flattener = FlattenRecordDim<TRecordDim>;

        Array<std::size_t, dim> pitches;

    public:
        static constexpr std::size_t blobCount = 1;

        LLAMA_FN_HOST_ACC_INLINE constexpr PitchedAoS(TArrayExtents extents, Array<std::size_t, dim> pitches)
            : Base(extents)
            , pitches(pitches)
        {
        }

        LLAMA_FN_HOST_ACC_INLINE constexpr PitchedAoS(TArrayExtents extents, std::size_t rowPitch)
            : Base(extents)
            , pitches(pitchesFromExtents<TRecordDim>(extents))
        {
            static_assert(dim >= 2, "The rowPitch constructor is only available for 2D or higher dimensions");
            pitches[dim - 2] = rowPitch;
        }

        LLAMA_FN_HOST_ACC_INLINE constexpr PitchedAoS(
            TArrayExtents extents,
            std::size_t rowPitch,
            std::size_t slicePitch)
            : Base(extents)
            , pitches(pitchesFromExtents<TRecordDim>(extents))
        {
            static_assert(
                dim >= 3,
                "The rowPitch/slicePitch constructor is only available for 3D or higher dimensions");
            pitches[dim - 2] = rowPitch;
            pitches[dim - 3] = slicePitch;
        }

        LLAMA_FN_HOST_ACC_INLINE constexpr auto blobSize(std::size_t) const -> std::size_t
        {
            return pitches[0] * Base::extents()[0];
        }

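        // Maps an array index plus a record coordinate to a byte offset within the single blob,
        // using the stored pitches for the array dimensions.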
        template<std::size_t... RecordCoords>
        LLAMA_FN_HOST_ACC_INLINE constexpr auto blobNrAndOffset(
            typename Base::ArrayIndex ai,
            RecordCoord<RecordCoords...> = {}) const -> NrAndOffset
        {
            constexpr std::size_t flatFieldIndex =
#ifdef __NVCC__
                *& // mess with nvcc compiler state to work around a bug
#endif
                Flattener::template flatIndex<RecordCoords...>;
            const auto offset
                = dot(pitches, ai) + flatOffsetOf<typename Flattener::FlatRecordDim, flatFieldIndex, AlignAndPad>;
            return {0, offset};
        }
    };
} // namespace llamaex

auto main() -> int
try
{
    int device = 0;
    checkError(cudaGetDevice(&device));
    cudaDeviceProp prop{};
    checkError(cudaGetDeviceProperties(&prop, device));
    fmt::print(
        "Running on {}, {}MiB GM, {}kiB SM\n",
        prop.name,
        prop.totalGlobalMem / 1024 / 1024,
        prop.sharedMemPerBlock / 1024);

    const auto extents = llama::ArrayExtents{600, 800}; // height, width
    const auto widthBytes = extents[1] * sizeof(RGB);

    const auto blockDim = dim3{16, 32, 1};
    const auto gridDim = dim3{
        llama::divCeil(static_cast<unsigned>(extents[1]), blockDim.x),
        llama::divCeil(static_cast<unsigned>(extents[0]), blockDim.y),
        1};

    std::vector<RGB> host1(llama::product(extents));
    std::vector<RGB> host2(llama::product(extents));
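    // First image: allocate pitched device memory with cudaMallocPitch and view it through the
    // custom PitchedAoS mapping, so the kernel transparently skips the per-row padding.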
    {
        std::byte* mem = nullptr;
        std::size_t rowPitch = 0;
        checkError(cudaMallocPitch(&mem, &rowPitch, widthBytes, extents[0]));
        fmt::print("Row pitch: {} B ({} B padding)\n", rowPitch, rowPitch - widthBytes);

        auto mapping = llamaex::PitchedAoS<llama::ArrayExtentsDynamic<2>, RGB>{extents, rowPitch};
        assert(mapping.blobSize(0) == rowPitch * extents[0]);
        auto view = llama::View{mapping, llama::Array{mem}};

        init<<<gridDim, blockDim>>>(view, extents);

        checkError(cudaMemcpy2D(host1.data(), widthBytes, mem, rowPitch, widthBytes, extents[0], cudaMemcpyDefault));
        checkError(cudaFree(mem));

        stbi_write_png("pitch1.png", extents[1], extents[0], 3, host1.data(), 0);
    }

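    // Second image: plain cudaMalloc with the standard, densely packed AoS mapping, used as a
    // reference to check that both layouts produce identical pixels.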
    // nvcc 11.3 fails to compile the AoS mapping here
#if !(defined(__NVCC__) && __CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ == 3)
    {
        std::byte* mem = nullptr;
        checkError(cudaMalloc(&mem, widthBytes * extents[0]));

        auto mapping = llama::mapping::AoS{extents, RGB{}};
        auto view = llama::View{mapping, llama::Array{mem}};

        init<<<gridDim, blockDim>>>(view, extents);

        checkError(cudaMemcpy(host2.data(), mem, widthBytes * extents[0], cudaMemcpyDefault));
        checkError(cudaFree(mem));

        stbi_write_png("pitch2.png", extents[1], extents[0], 3, host2.data(), 0);
    }
#endif

    if(host1 != host2)
        fmt::print("ERROR: produced two different images\n");

    return 0;
}
catch(const std::exception& e)
{
    std::cerr << "Exception: " << e.what() << '\n';
}
