diff --git a/examples/cuda/cudaWriteRead.cu b/examples/cuda/cudaWriteRead.cu index e83754ac7c..433dd58149 100644 --- a/examples/cuda/cudaWriteRead.cu +++ b/examples/cuda/cudaWriteRead.cu @@ -26,6 +26,7 @@ int BPWrite(const std::string fname, const size_t N, int nSteps){ // Set up the ADIOS structures adios2::ADIOS adios; adios2::IO io = adios.DeclareIO("WriteIO"); + io.SetEngine("BP5"); // Declare an array for the ADIOS data of size (NumOfProcesses * N) const adios2::Dims shape{static_cast(N)}; @@ -61,28 +62,23 @@ int BPRead(const std::string fname, const size_t N, int nSteps){ // Create ADIOS structures adios2::ADIOS adios; adios2::IO io = adios.DeclareIO("ReadIO"); + io.SetEngine("BP5"); adios2::Engine bpReader = io.Open(fname, adios2::Mode::Read); - auto data = io.InquireVariable("data"); - std::cout << "Steps expected by the reader: " << bpReader.Steps() << std::endl; - std::cout << "Expecting data per step: " << data.Shape()[0]; - std::cout << " elements" << std::endl; - - int write_step = bpReader.Steps(); - // Create the local buffer and initialize the access point in the ADIOS file - std::vector simData(N); //set size to N - const adios2::Dims start{0}; - const adios2::Dims count{N}; - const adios2::Box sel(start, count); - data.SetSelection(sel); - - // Read the data in each of the ADIOS steps - for (size_t step = 0; step < write_step; step++) + unsigned int step = 0; + for (; bpReader.BeginStep() == adios2::StepStatus::OK; ++step) { - data.SetStepSelection({step, 1}); + auto data = io.InquireVariable("data"); + // Create the local buffer and initialize the access point in the ADIOS file + std::vector simData(N); //set size to N + const adios2::Dims start{0}; + const adios2::Dims count{N}; + const adios2::Box sel(start, count); + data.SetSelection(sel); + bpReader.Get(data, simData.data()); - bpReader.PerformGets(); + bpReader.EndStep(); std::cout << "Simualation step " << step << " : "; std::cout << simData.size() << " elements: " << simData[1] << std::endl; } diff --git a/source/adios2/engine/bp5/BP5Writer.tcc b/source/adios2/engine/bp5/BP5Writer.tcc index af19b388d8..d563781fde 100644 --- a/source/adios2/engine/bp5/BP5Writer.tcc +++ b/source/adios2/engine/bp5/BP5Writer.tcc @@ -26,6 +26,10 @@ void BP5Writer::PutCommon(Variable &variable, const T *values, bool sync) BeginStep(StepMode::Update); } variable.SetData(values); + // if the user buffer is allocated on the GPU always use sync mode + bool isCudaBuffer = (variable.m_MemorySpace == MemorySpace::CUDA); + if (isCudaBuffer) + sync = true; size_t *Shape = NULL; size_t *Start = NULL; diff --git a/source/adios2/helper/adiosMemory.h b/source/adios2/helper/adiosMemory.h index a67abc904f..d197876d61 100644 --- a/source/adios2/helper/adiosMemory.h +++ b/source/adios2/helper/adiosMemory.h @@ -46,6 +46,9 @@ void InsertToBuffer(std::vector &buffer, const T *source, template void CopyFromGPUToBuffer(std::vector &buffer, size_t &position, const T *source, const size_t elements = 1) noexcept; +template +void CudaMemCopyToBuffer(char *buffer, size_t position, const T *source, + const size_t size) noexcept; /** * Wrapper around cudaMemcpy needed for isolating CUDA interface dependency diff --git a/source/adios2/helper/adiosMemory.inl b/source/adios2/helper/adiosMemory.inl index cfc15ef971..efdb878be0 100644 --- a/source/adios2/helper/adiosMemory.inl +++ b/source/adios2/helper/adiosMemory.inl @@ -79,10 +79,17 @@ template void CopyFromGPUToBuffer(std::vector &buffer, size_t &position, const T *source, const size_t elements) noexcept { - const char *src = reinterpret_cast(source); - MemcpyGPUToBuffer(buffer.data() + position, src, elements * sizeof(T)); + CudaMemCopyToBuffer(buffer.data(), position, source, elements * sizeof(T)); position += elements * sizeof(T); } + +template +void CudaMemCopyToBuffer(char *buffer, size_t position, + const T *source, const size_t size) noexcept +{ + const char *src = reinterpret_cast(source); + MemcpyGPUToBuffer(buffer + position, src, size); +} #endif template diff --git a/source/adios2/toolkit/format/bp5/BP5Serializer.cpp b/source/adios2/toolkit/format/bp5/BP5Serializer.cpp index 019f1a8cea..0902be3bf8 100644 --- a/source/adios2/toolkit/format/bp5/BP5Serializer.cpp +++ b/source/adios2/toolkit/format/bp5/BP5Serializer.cpp @@ -570,15 +570,27 @@ void BP5Serializer::DumpDeferredBlocks(bool forceCopyDeferred) } static void GetMinMax(const void *Data, size_t ElemCount, const DataType Type, - core::Engine::MinMaxStruct &MinMax) + core::Engine::MinMaxStruct &MinMax, MemorySpace MemSpace) { - MinMax.Init(Type); if (ElemCount == 0) return; if (Type == DataType::Compound) { } +#ifdef ADIOS2_HAVE_CUDA +#define pertype(T, N) \ + else if (MemSpace == MemorySpace::CUDA && \ + Type == helper::GetDataType()) \ + { \ + const size_t size = ElemCount * sizeof(T); \ + const T *values = (const T *)Data; \ + helper::CUDAMinMax(values, ElemCount, MinMax.MinUnion.field_##N, \ + MinMax.MaxUnion.field_##N); \ + } + ADIOS2_FOREACH_MINMAX_STDTYPE_2ARGS(pertype) +#undef pertype +#endif #define pertype(T, N) \ else if (Type == helper::GetDataType()) \ { \ @@ -669,7 +681,8 @@ void BP5Serializer::Marshal(void *Variable, const char *Name, MinMax.Init(Type); if ((m_StatsLevel > 0) && !Span) { - GetMinMax(Data, ElemCount, (DataType)Rec->Type, MinMax); + GetMinMax(Data, ElemCount, (DataType)Rec->Type, MinMax, + VB->m_MemorySpace); } if (Rec->OperatorType) @@ -698,9 +711,10 @@ void BP5Serializer::Marshal(void *Variable, const char *Name, { if (!DeferAddToVec) { - DataOffset = m_PriorDataBufferSizeTotal + - CurDataBuffer->AddToVec(ElemCount * ElemSize, Data, - ElemSize, Sync); + DataOffset = + m_PriorDataBufferSizeTotal + + CurDataBuffer->AddToVec(ElemCount * ElemSize, Data, + ElemSize, Sync, VB->m_MemorySpace); } } else diff --git a/source/adios2/toolkit/format/buffer/BufferV.h b/source/adios2/toolkit/format/buffer/BufferV.h index b55a9dabba..be1d28be80 100644 --- a/source/adios2/toolkit/format/buffer/BufferV.h +++ b/source/adios2/toolkit/format/buffer/BufferV.h @@ -41,7 +41,8 @@ class BufferV virtual void Reset(); virtual size_t AddToVec(const size_t size, const void *buf, size_t align, - bool CopyReqd) = 0; + bool CopyReqd, + MemorySpace MemSpace = MemorySpace::Host) = 0; struct BufferPos { diff --git a/source/adios2/toolkit/format/buffer/chunk/ChunkV.cpp b/source/adios2/toolkit/format/buffer/chunk/ChunkV.cpp index bf1e3a0131..94b14adaaf 100644 --- a/source/adios2/toolkit/format/buffer/chunk/ChunkV.cpp +++ b/source/adios2/toolkit/format/buffer/chunk/ChunkV.cpp @@ -7,6 +7,7 @@ */ #include "ChunkV.h" +#include "adios2/helper/adiosFunctions.h" #include "adios2/toolkit/format/buffer/BufferV.h" #include @@ -82,7 +83,7 @@ void ChunkV::CopyExternalToInternal() } size_t ChunkV::AddToVec(const size_t size, const void *buf, size_t align, - bool CopyReqd) + bool CopyReqd, MemorySpace MemSpace) { if (size == 0) { @@ -120,7 +121,7 @@ size_t ChunkV::AddToVec(const size_t size, const void *buf, size_t align, if (AppendPossible) { // We can use current chunk, just append the data; - memcpy(m_TailChunk + m_TailChunkPos, buf, size); + CopyDataToBuffer(size, buf, m_TailChunkPos, MemSpace); DataV.back().Size += size; m_TailChunkPos += size; } @@ -132,7 +133,7 @@ size_t ChunkV::AddToVec(const size_t size, const void *buf, size_t align, NewSize = size; m_TailChunk = (char *)malloc(NewSize); m_Chunks.push_back(m_TailChunk); - memcpy(m_TailChunk, buf, size); + CopyDataToBuffer(size, buf, 0, MemSpace); m_TailChunkPos = size; VecEntry entry = {false, m_TailChunk, 0, size}; DataV.push_back(entry); @@ -142,6 +143,19 @@ size_t ChunkV::AddToVec(const size_t size, const void *buf, size_t align, return retOffset; } +void ChunkV::CopyDataToBuffer(const size_t size, const void *buf, size_t pos, + MemorySpace MemSpace) +{ +#ifdef ADIOS2_HAVE_CUDA + if (MemSpace == MemorySpace::CUDA) + { + helper::CudaMemCopyToBuffer(m_TailChunk, pos, buf, size); + return; + } +#endif + memcpy(m_TailChunk + pos, buf, size); +} + BufferV::BufferPos ChunkV::Allocate(const size_t size, size_t align) { if (size == 0) diff --git a/source/adios2/toolkit/format/buffer/chunk/ChunkV.h b/source/adios2/toolkit/format/buffer/chunk/ChunkV.h index 4db67aed8a..4642108c5f 100644 --- a/source/adios2/toolkit/format/buffer/chunk/ChunkV.h +++ b/source/adios2/toolkit/format/buffer/chunk/ChunkV.h @@ -32,7 +32,8 @@ class ChunkV : public BufferV virtual std::vector DataVec() noexcept; virtual size_t AddToVec(const size_t size, const void *buf, size_t align, - bool CopyReqd); + bool CopyReqd, + MemorySpace MemSpace = MemorySpace::Host); virtual BufferPos Allocate(const size_t size, size_t align); virtual void DownsizeLastAlloc(const size_t oldSize, const size_t newSize); @@ -40,6 +41,8 @@ class ChunkV : public BufferV virtual void *GetPtr(int bufferIdx, size_t posInBuffer); void CopyExternalToInternal(); + void CopyDataToBuffer(const size_t size, const void *buf, size_t pos, + MemorySpace MemSpace); private: std::vector m_Chunks; diff --git a/source/adios2/toolkit/format/buffer/malloc/MallocV.cpp b/source/adios2/toolkit/format/buffer/malloc/MallocV.cpp index 7f0015a62f..c23cb1e415 100644 --- a/source/adios2/toolkit/format/buffer/malloc/MallocV.cpp +++ b/source/adios2/toolkit/format/buffer/malloc/MallocV.cpp @@ -85,7 +85,7 @@ void MallocV::CopyExternalToInternal() } size_t MallocV::AddToVec(const size_t size, const void *buf, size_t align, - bool CopyReqd) + bool CopyReqd, MemorySpace MemSpace) { if (size == 0) { diff --git a/source/adios2/toolkit/format/buffer/malloc/MallocV.h b/source/adios2/toolkit/format/buffer/malloc/MallocV.h index ce1d43ccfc..e78b69b945 100644 --- a/source/adios2/toolkit/format/buffer/malloc/MallocV.h +++ b/source/adios2/toolkit/format/buffer/malloc/MallocV.h @@ -36,7 +36,8 @@ class MallocV : public BufferV virtual void Reset(); virtual size_t AddToVec(const size_t size, const void *buf, size_t align, - bool CopyReqd); + bool CopyReqd, + MemorySpace MemSpace = MemorySpace::Host); virtual BufferPos Allocate(const size_t size, size_t align); void DownsizeLastAlloc(const size_t oldSize, const size_t newSize);