Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ORO-0] Update. #66

Merged
merged 2 commits into from
Mar 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 142 additions & 0 deletions Orochi/GpuMemory.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
#pragma once

#include <Orochi/OrochiUtils.h>
#include <cstddef>
#include <utility>
#include <vector>

namespace Oro
{

/// @brief Convert the address of a device pointer into the untyped pointer form used when passing arguments to kernel calls.
/// @tparam T The type of the element the device pointer refers to.
/// @param ptr The address of a pointer to the device memory.
/// @return The same address, as a non-const void pointer.
template<typename T>
void* arg_cast( T* const* ptr ) noexcept
{
	// Drop the const on the pointed-to pointer first; the conversion to void* is then a plain static_cast.
	T** mutable_ptr = const_cast<T**>( ptr );
	return static_cast<void*>( mutable_ptr );
}

/// @brief A move-only owner of a typed device memory buffer.
/// The buffer is obtained with OrochiUtils::malloc and released with OrochiUtils::free when the object is destroyed.
/// @tparam T The type of the element stored in the device memory.
template<typename T>
class GpuMemory final
{
  public:
	/// @brief Create an empty object that owns no device memory.
	GpuMemory() = default;

	/// @brief Allocate the device memory with the given size.
	/// @param init_size The initial size which represents the number of elements. No allocation is requested when it is zero.
	explicit GpuMemory( const size_t init_size )
	{
		// An empty buffer needs no backend allocation; a null m_data then always means "nothing owned".
		if( init_size == 0ULL )
		{
			return;
		}

		OrochiUtils::malloc( m_data, init_size );

		m_size = init_size;
		m_capacity = init_size;
	}

	GpuMemory( const GpuMemory& ) = delete;
	GpuMemory& operator=( const GpuMemory& other ) = delete;

	/// @brief Take over the buffer owned by @c other, leaving @c other empty.
	GpuMemory( GpuMemory&& other ) noexcept : m_data{ std::exchange( other.m_data, nullptr ) }, m_size{ std::exchange( other.m_size, 0ULL ) }, m_capacity{ std::exchange( other.m_capacity, 0ULL ) } {}

	/// @brief Release the currently owned buffer and take over the one owned by @c other, leaving @c other empty.
	/// Self-assignment is a no-op.
	GpuMemory& operator=( GpuMemory&& other ) noexcept
	{
		if( this != &other )
		{
			// tmp steals other's buffer, then trades it for ours; our old buffer is freed when tmp goes out of scope.
			GpuMemory tmp( std::move( other ) );
			swap( *this, tmp );
		}

		return *this;
	}

	~GpuMemory()
	{
		if( m_data )
		{
			OrochiUtils::free( m_data );
			m_data = nullptr;
		}
		m_size = 0ULL;
		m_capacity = 0ULL;
	}

	/// @brief Get the size of the device memory.
	/// @return The size of the device memory, in number of elements.
	size_t size() const noexcept { return m_size; }

	/// @brief Get the pointer to the device memory.
	/// @return The pointer to the device memory, or nullptr if nothing is allocated.
	T* ptr() const noexcept { return m_data; }

	/// @brief Get the address of the pointer to the device memory. Useful for passing arguments to the kernel call.
	/// @return The address of the pointer to the device memory.
	T* const* address() const noexcept { return &m_data; }

	/// @brief Resize the device memory. Its capacity is unchanged if the new size is not larger than the current capacity.
	/// The old data should be considered invalid to be used after the function is called unless @c copy is set to True.
	/// @param new_size The new memory size (number of elements) after the function is called.
	/// @param copy If true, the function will copy the data to the newly created memory space as well.
	void resize( const size_t new_size, const bool copy = false ) noexcept
	{
		if( new_size <= m_capacity )
		{
			// Enough room already: only the logical size changes, the allocation is kept.
			m_size = new_size;
			return;
		}

		GpuMemory tmp( new_size );

		// Nothing to carry over when the current buffer is empty.
		if( copy && m_data != nullptr && m_size > 0ULL )
		{
			OrochiUtils::copyDtoD( tmp.m_data, m_data, m_size );
		}

		*this = std::move( tmp );
	}

	/// @brief Reset the memory space so that all bits inside are cleared to zero. Does nothing on an empty buffer.
	void reset() noexcept
	{
		if( m_data == nullptr || m_size == 0ULL )
		{
			return;
		}

		OrochiUtils::memset( m_data, 0, m_size * sizeof( T ) );
	}

	/// @brief Copy the data from host memory to the device memory, resizing the device memory to fit.
	/// @param host_ptr The host pointer to read from.
	/// @param host_data_size The size of the host memory which represents the number of elements.
	void copyFromHost( const T* host_ptr, const size_t host_data_size ) noexcept
	{
		resize( host_data_size );

		// A zero-length upload only updates the size; no transfer is issued.
		if( host_data_size == 0ULL )
		{
			return;
		}

		OrochiUtils::copyHtoD( m_data, host_ptr, host_data_size );
	}

	/// @brief Get the content of the first element stored in the device memory.
	/// @return The content of the first element in the device memory, or a value-initialized T if no device memory is allocated.
	T getSingle() const noexcept
	{
		T result{};

		// Never read through a null device pointer.
		if( m_data != nullptr )
		{
			OrochiUtils::copyDtoH( &result, m_data, 1ULL );
		}

		return result;
	}

	/// @brief Get all the data stored in the device memory.
	/// @return A vector which contains all the data stored in the device memory (empty if the size is zero).
	std::vector<T> getData() const noexcept
	{
		std::vector<T> result( m_size );

		if( m_data != nullptr && !result.empty() )
		{
			OrochiUtils::copyDtoH( result.data(), m_data, m_size );
		}

		return result;
	}

  private:
	/// @brief Exchange the pointer, size and capacity of two objects.
	static void swap( GpuMemory& lhs, GpuMemory& rhs ) noexcept
	{
		std::swap( lhs.m_data, rhs.m_data );
		std::swap( lhs.m_size, rhs.m_size );
		std::swap( lhs.m_capacity, rhs.m_capacity );
	}

	T* m_data{ nullptr };	   // Device pointer; nullptr when nothing is owned.
	size_t m_size{ 0ULL };	   // Number of elements currently in use.
	size_t m_capacity{ 0ULL }; // Number of elements the allocation can hold.
};

} // namespace Oro
114 changes: 63 additions & 51 deletions Orochi/Orochi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,35 +81,63 @@ int oroInitialize( oroApi api, oroU32 flags )
s_api = api;
int e = 0;
s_loadedApis = 0;
if( (api & ORO_API_CUDA) == ORO_API_CUDA )

if( api & ORO_API_CUDA )
{
e = cuewInit( CUEW_INIT_CUDA | CUEW_INIT_NVRTC );
if( e == 0 )
s_loadedApis |= ORO_API_CUDA | ORO_API_CUDADRIVER | ORO_API_CUDARTC;
}
if ((s_loadedApis & ORO_API_CUDA) == 0) {
if (api & ORO_API_CUDADRIVER)
cuuint32_t flag = 0;
if( api & ORO_API_CUDADRIVER )
{
cuuint32_t cuewInitFlags = CUEW_INIT_CUDA;
if ( api & ORO_API_CUDARTC ) cuewInitFlags |= CUEW_INIT_NVRTC;
e = cuewInit( cuewInitFlags );
if( e == 0 )
{
s_loadedApis |= ORO_API_CUDADRIVER;
if ( api & ORO_API_CUDARTC ) s_loadedApis |= ORO_API_CUDARTC;
}
flag |= CUEW_INIT_CUDA;
}
if( api & ORO_API_CUDARTC )
{
flag |= CUEW_INIT_NVRTC;
}

int resultDriver, resultRtc;
cuewInit( &resultDriver, &resultRtc, flag );

if( resultDriver == CUEW_SUCCESS )
{
s_loadedApis |= ORO_API_CUDADRIVER;
}
if( resultRtc == CUEW_SUCCESS )
{
s_loadedApis |= ORO_API_CUDARTC;
}
}
if( api & ORO_API_HIP )
{
e = hipewInit( HIPEW_INIT_HIP );
if( e == 0 )
s_loadedApis |= ORO_API_HIP;
hipuint32_t flag = 0;
if( api & ORO_API_HIPDRIVER )
{
flag |= HIPEW_INIT_HIPDRIVER;
}
if( api & ORO_API_HIPRTC )
{
flag |= HIPEW_INIT_HIPRTC;
}

int resultDriver, resultRtc;
hipewInit( &resultDriver, &resultRtc, flag );

if( resultDriver == HIPEW_SUCCESS )
{
s_loadedApis |= ORO_API_HIPDRIVER;
}
if( resultRtc == HIPEW_SUCCESS )
{
s_loadedApis |= ORO_API_HIPRTC;
}
}
if( s_loadedApis == 0 )
return ORO_ERROR_OPEN_FAILED;
return ORO_SUCCESS;
}
// Returns the set of APIs recorded in s_loadedApis, converted to oroApi.
oroApi oroLoadedAPI()
{
	const oroApi loaded = static_cast<oroApi>( s_loadedApis );
	return loaded;
}
oroApi oroGetCurAPI(oroU32 flags)
{
return s_api;
Expand Down Expand Up @@ -355,6 +383,18 @@ oroError OROAPI oroDeviceGetAttribute(int* pi, oroDeviceAttribute attrib, oroDev
return oroErrorUnknown;
}

// Queries the resource limit identified by `limit` and stores it in *pValue.
// The request is handed to __ORO_FUNC1 with CtxGetLimit (limit cast to CUlimit) and DeviceGetLimit (limit cast to hipLimit_t).
// NOTE(review): __ORO_FUNC1 is defined outside this view; presumably it returns the backend result itself,
// so the trailing oroErrorUnknown is only reached when no backend handled the call -- confirm.
oroError OROAPI oroDeviceGetLimit( size_t* pValue, oroLimit limit )
{
__ORO_FUNC1( CtxGetLimit( pValue, (CUlimit)limit ), DeviceGetLimit( pValue, (hipLimit_t)limit ) );
return oroErrorUnknown;
}

// Sets the resource limit identified by `limit` to `value`.
// The request is handed to __ORO_FUNC1 with CtxSetLimit (limit cast to CUlimit) and DeviceSetLimit (limit cast to hipLimit_t).
// NOTE(review): __ORO_FUNC1 is defined outside this view; presumably it returns the backend result itself,
// so the trailing oroErrorUnknown is only reached when no backend handled the call -- confirm.
oroError OROAPI oroDeviceSetLimit( oroLimit limit, size_t value )
{
__ORO_FUNC1( CtxSetLimit( (CUlimit)limit, value ), DeviceSetLimit( (hipLimit_t)limit, value ) );
return oroErrorUnknown;
}

oroError OROAPI oroDeviceComputeCapability(int* major, int* minor, oroDevice dev)
{
return oroErrorUnknown;
Expand Down Expand Up @@ -468,6 +508,7 @@ oroError OROAPI oroDeviceSynchronize(void)
//oroError OROAPI oroCtxSetCacheConfig(hipFuncCache_t config);
//oroError OROAPI oroCtxGetSharedMemConfig(hipSharedMemConfig* pConfig);
//oroError OROAPI oroCtxSetSharedMemConfig(hipSharedMemConfig config);

oroError OROAPI oroCtxGetApiVersion(oroCtx ctx, unsigned int* version)
{
__ORO_FUNC1( CtxGetApiVersion(*oroCtx2cu(&ctx), version ), CtxGetApiVersion(*oroCtx2hip(&ctx), version ) );
Expand Down Expand Up @@ -658,55 +699,26 @@ oroError OROAPI oroModuleOccupancyMaxPotentialBlockSize( int* minGridSize, int*
}

//-------------------
oroError OROAPI oroImportExternalMemory(oroExternalMemory* extMem_out, const oroExternalMemoryHandleDesc* memHandleDesc)
oroError OROAPI oroImportExternalMemory(oroExternalMemory_t* extMem_out, const oroExternalMemoryHandleDesc* memHandleDesc)
{
__ORO_FUNC1( ImportExternalMemory( (CUexternalMemory*)extMem_out, (const CUDA_EXTERNAL_MEMORY_HANDLE_DESC*)memHandleDesc ),
ImportExternalMemory( (hipExternalMemory_t*)extMem_out, (const hipExternalMemoryHandleDesc*)memHandleDesc ) );
return oroErrorUnknown;
}
//-------------------
// Imports an external semaphore described by semHandleDesc and writes the resulting handle to *extSem_out.
// Both arguments are reinterpreted as the CUDA or HIP equivalents and handed to __ORO_FUNC1.
// NOTE(review): the casts assume the oro* types are layout-compatible with their CUDA/HIP counterparts,
// and __ORO_FUNC1 (not visible here) presumably returns the backend result -- confirm both.
oroError OROAPI oroImportExternalSemaphore(oroExternalSemaphore* extSem_out, const oroExternalSemaphoreHandleDesc* semHandleDesc)
{
__ORO_FUNC1(ImportExternalSemaphore((CUexternalSemaphore*)extSem_out, (const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC*)semHandleDesc),
ImportExternalSemaphore((hipExternalSemaphore_t*)extSem_out, (const hipExternalSemaphoreHandleDesc*)semHandleDesc));
return oroErrorUnknown;
}
//-------------------
oroError OROAPI oroExternalMemoryGetMappedBuffer(oroDeviceptr* devPtr, oroExternalMemory extMem, const oroExternalMemoryBufferDesc* bufferDesc)
oroError OROAPI oroExternalMemoryGetMappedBuffer(void **devPtr, oroExternalMemory_t extMem, const oroExternalMemoryBufferDesc* bufferDesc)
{
__ORO_FUNC1( ExternalMemoryGetMappedBuffer( (CUdeviceptr*)devPtr, (CUexternalMemory)extMem, (const CUDA_EXTERNAL_MEMORY_BUFFER_DESC*)bufferDesc ),
ExternalMemoryGetMappedBuffer( (void**)devPtr, (hipExternalMemory_t)extMem, (const hipExternalMemoryBufferDesc*)bufferDesc ) );
ExternalMemoryGetMappedBuffer( devPtr, (hipExternalMemory_t)extMem, (const hipExternalMemoryBufferDesc*)bufferDesc ) );
return oroErrorUnknown;
}
//-------------------
oroError OROAPI oroDestroyExternalMemory(oroExternalMemory extMem)
oroError OROAPI oroDestroyExternalMemory(oroExternalMemory_t extMem)
{
__ORO_FUNC1( DestroyExternalMemory( (CUexternalMemory)extMem ),
DestroyExternalMemory( (hipExternalMemory_t)extMem ) );
return oroErrorUnknown;
}
//-------------------
// Destroys the external semaphore handle extSem.
// The handle is cast to the CUDA or HIP handle type and handed to __ORO_FUNC1.
// NOTE(review): __ORO_FUNC1 is not visible here; presumably it returns the backend result,
// leaving oroErrorUnknown as the fallback -- confirm.
oroError OROAPI oroDestroyExternalSemaphore(oroExternalSemaphore extSem)
{
__ORO_FUNC1(DestroyExternalSemaphore((CUexternalSemaphore)extSem),
DestroyExternalSemaphore((hipExternalSemaphore_t)extSem));
return oroErrorUnknown;
}
//-------------------
// Issues a wait on numExtSems external semaphores on the given stream.
// extSemArray and paramsArray are passed as arrays with numExtSems as their count.
// All arguments are cast to the CUDA or HIP equivalents and handed to __ORO_FUNC1.
// NOTE(review): the casts drop the const qualifier on extSemArray; __ORO_FUNC1 is not visible here and
// presumably returns the backend result, leaving oroErrorUnknown as the fallback -- confirm.
oroError OROAPI oroWaitExternalSemaphoresAsync(const oroExternalSemaphore* extSemArray, const oroExternalSemaphoreWaitParams* paramsArray, unsigned int numExtSems, oroStream stream)
{
__ORO_FUNC1(WaitExternalSemaphoresAsync((CUexternalSemaphore*)extSemArray, (const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS*)paramsArray, numExtSems, (CUstream)stream),
WaitExternalSemaphoresAsync((hipExternalSemaphore_t*)extSemArray, (const hipExternalSemaphoreWaitParams*)paramsArray, numExtSems, (hipStream_t)stream));
return oroErrorUnknown;
}
//-------------------
// Issues a signal on numExtSems external semaphores on the given stream.
// extSemArray and paramsArray are passed as arrays with numExtSems as their count.
// All arguments are cast to the CUDA or HIP equivalents and handed to __ORO_FUNC1.
// NOTE(review): the casts drop the const qualifier on extSemArray; __ORO_FUNC1 is not visible here and
// presumably returns the backend result, leaving oroErrorUnknown as the fallback -- confirm.
oroError OROAPI oroSignalExternalSemaphoresAsync(const oroExternalSemaphore* extSemArray, const oroExternalSemaphoreSignalParams* paramsArray, unsigned int numExtSems, oroStream stream)
{
__ORO_FUNC1(SignalExternalSemaphoresAsync((CUexternalSemaphore*)extSemArray, (const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS*)paramsArray, numExtSems, (CUstream)stream),
SignalExternalSemaphoresAsync((hipExternalSemaphore_t*)extSemArray, (const hipExternalSemaphoreSignalParams*)paramsArray, numExtSems, (hipStream_t)stream));
return oroErrorUnknown;
}

/* oroError OROAPI oroGetLastError(oroError oro_error)
{
__ORO_FUNC2(GetLastError((cudaError_t)oro_error),
Expand Down
Loading