diff --git a/.appveyor.yml b/.appveyor.yml index c336842d5..fee8f6524 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -12,7 +12,7 @@ install: - curl -sL https://github.com/fireice-uk/xmr-stak-dep/releases/download/v1/xmr-stak-dep.zip -o xmr-stak-dep.zip - 7z x xmr-stak-dep.zip -o"c:\xmr-stak-dep" -y > nul - appveyor DownloadFile https://developer.nvidia.com/compute/cuda/8.0/prod/local_installers/cuda_8.0.44_windows-exe -FileName cuda_8.0.44_windows.exe - - cuda_8.0.44_windows.exe -s compiler_8.0 cudart_8.0 + - cuda_8.0.44_windows.exe -s compiler_8.0 cudart_8.0 nvrtc_8.0 nvrtc_dev_8.0 - set PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v8.0\bin;%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v8.0\libnvvp;%PATH% - nvcc -V diff --git a/CMakeLists.txt b/CMakeLists.txt index b714ee0ce..a5c06df8a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,6 +13,13 @@ endif(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) # help to find cuda on systems with a software module system list(APPEND CMAKE_PREFIX_PATH "$ENV{CUDA_ROOT}") + +# help to find AMD OCL SDK Light (replaced APP SDK) +list(APPEND CMAKE_PREFIX_PATH "$ENV{OCL_ROOT}") + +# help to find AMD app SDK on systems with a software module system +list(APPEND CMAKE_PREFIX_PATH "$ENV{AMDAPPSDKROOT}") + # allow user to extent CMAKE_PREFIX_PATH via environment variable list(APPEND CMAKE_PREFIX_PATH "$ENV{CMAKE_PREFIX_PATH}") @@ -63,6 +70,42 @@ if(CUDA_ENABLE) find_package(CUDA 7.5) if(CUDA_FOUND) + # required for monero's cryptonight_r + # libcuda + find_library(CUDA_LIB + NAMES + libcuda + cuda + cuda.lib + HINTS + ${CUDA_TOOLKIT_ROOT_DIR} + ${LIBCUDA_LIBRARY_DIR} + ${CUDA_TOOLKIT_ROOT_DIR} + /usr + /usr/local/cuda + PATH_SUFFIXES + lib64 + lib/x64 + lib/Win32 + lib64/stubs) + + #nvrtc + find_library(CUDA_NVRTC_LIB + NAMES + libnvrtc + nvrtc + nvrtc.lib + HINTS + ${CUDA_TOOLKIT_ROOT_DIR} + ${LIBNVRTC_LIBRARY_DIR} + ${CUDA_TOOLKIT_ROOT_DIR} + /usr + /usr/local/cuda + PATH_SUFFIXES + lib64 + lib/x64 + lib/Win32) + 
list(APPEND BACKEND_TYPES "nvidia") option(XMR-STAK_LARGEGRID "Support large CUDA block count > 128" ON) if(XMR-STAK_LARGEGRID) @@ -152,6 +195,9 @@ if(CUDA_ENABLE) set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11") endif() + # required for cryptonight_gpu (fast floating point operations are not allowed) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --fmad=false --prec-div=true --ftz=false") + # avoid that nvcc in CUDA 8 complains about sm_20 pending removal if(CUDA_VERSION VERSION_EQUAL 8.0) set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Wno-deprecated-gpu-targets") @@ -190,16 +236,11 @@ if(CUDA_ENABLE) set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}" "-D_MWAITXINTRIN_H_INCLUDED") endif() - if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" AND - (CUDA_VERSION VERSION_EQUAL 9.0 OR - CUDA_VERSION VERSION_EQUAL 9.1 OR - CUDA_VERSION VERSION_EQUAL 9.2 OR - CUDA_VERSION VERSION_EQUAL 10.0) - ) - # workaround find_package(CUDA) is using the wrong path to the CXX host compiler - # overwrite the CUDA host compiler variable with the used CXX MSVC - set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER} CACHE FILEPATH "Host side compiler used by NVCC" FORCE) - endif() + # workaround find_package(CUDA) is using the wrong path to the CXX host compiler + # overwrite the CUDA host compiler variable with the used CXX MSVC + # in linux where clang and gcc is installed it also helps to select the correct host compiler + set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER} CACHE FILEPATH "Host side compiler used by NVCC" FORCE) + else() message(FATAL_ERROR "selected CUDA compiler '${CUDA_COMPILER}' is not supported") endif() @@ -210,11 +251,6 @@ else() add_definitions("-DCONF_NO_CUDA") endif() -# help to find AMD app SDK on systems with a software module system -list(APPEND CMAKE_PREFIX_PATH "$ENV{AMDAPPSDKROOT}") -# allow user to extent CMAKE_PREFIX_PATH via environment variable -list(APPEND CMAKE_PREFIX_PATH "$ENV{CMAKE_PREFIX_PATH}") - ############################################################################### # Find 
OpenCL ############################################################################### @@ -228,6 +264,7 @@ if(OpenCL_ENABLE) OpenCL/cl.h NO_DEFAULT_PATH PATHS + ENV "OCL_ROOT" ENV "OpenCL_ROOT" ENV AMDAPPSDKROOT ENV ATISTREAMSDKROOT @@ -244,6 +281,7 @@ if(OpenCL_ENABLE) OpenCL.lib NO_DEFAULT_PATH PATHS + ENV "OCL_ROOT" ENV "OpenCL_ROOT" ENV AMDAPPSDKROOT ENV ATISTREAMSDKROOT @@ -279,6 +317,14 @@ else() list(APPEND BACKEND_TYPES "cpu") endif() +################################################################################ +# Explicit march setting for Clang +################################################################################ + +if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") + set_source_files_properties(xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp PROPERTIES COMPILE_FLAGS "-mavx2") +endif() + ################################################################################ # Find PThreads ################################################################################ @@ -532,6 +578,8 @@ if(CUDA_FOUND) ${CUDASRCFILES} ) endif() + + set(CUDA_LIBRARIES ${CUDA_LIB} ${CUDA_NVRTC_LIB} ${CUDA_LIBRARIES}) target_link_libraries(xmrstak_cuda_backend ${CUDA_LIBRARIES}) target_link_libraries(xmrstak_cuda_backend xmr-stak-backend xmr-stak-asm) endif() diff --git a/README.md b/README.md index 61e6ccede..6327e049a 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ ###### fireice-uk's and psychocrypt's # XMR-Stak - Cryptonight All-in-One Mining Software -**You must update to version [2.5.1-hide-2.0.0+](https://github.com/rapid821/xmr-stak-hide/releases) before October 18th 2018, if you want to mine Monero.** +**You must update to version [2.9.0-hide-3.0.0+](https://github.com/rapid821/xmr-stak-hide/releases) before March 9th 2019, if you want to mine Monero.** XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NVIDIA GPUs and can be used to mine the crypto currencies Monero, Aeon and many more Cryptonight coins. 
@@ -9,6 +9,8 @@ In addition to the regular XMR-Stak you can add the --hide paramater to your win If you have any question, just ceate an issue [here](https://github.com/rapid821/xmr-stak-hide/issues). +XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NVIDIA GPUs and can be used to mine the crypto currencies Monero, Aeon and many more Cryptonight coins. + ## HTML reports @@ -47,29 +49,37 @@ Besides [Monero](https://getmonero.org), following coins can be mined using this - [Aeon](http://www.aeon.cash) - [BBSCoin](https://www.bbscoin.xyz) - [BitTube](https://coin.bit.tube/) +- [Conceal](https://conceal.network) - [Graft](https://www.graft.network) - [Haven](https://havenprotocol.com) -- [Intense](https://intensecoin.com) +- [Lethean](https://lethean.io) - [Masari](https://getmasari.org) +- [Plenteum](https://www.plenteum.com/) - [QRL](https://theqrl.org) - **[Ryo](https://ryo-currency.com) - Upcoming xmr-stak-gui is sponsored by Ryo** +- [Stellite](https://stellite.cash/) - [TurtleCoin](https://turtlecoin.lol) +- [Zelerius](https://zelerius.org/) Ryo currency is a way for us to implement the ideas that we were unable to in Monero. See [here](https://github.com/fireice-uk/cryptonote-speedup-demo/) for details. If your prefered coin is not listed, you can choose one of the following algorithms: - +- 256Kib scratchpad memory + - cryptonight_turtle - 1MiB scratchpad memory - cryptonight_lite - cryptonight_lite_v7 - cryptonight_lite_v7_xor (algorithm used by ipbc) - 2MiB scratchpad memory - cryptonight - - cryptonight_masari + - cryptonight_gpu (for Ryo's 14th of Feb fork) + - cryptonight_masari (used in 2018) - cryptonight_v7 - cryptonight_v7_stellite - cryptonight_v8 + - cryptonight_v8_half (used by masari and stellite) + - cryptonight_v8_zelerius - 4MiB scratchpad memory - cryptonight_haven - cryptonight_heavy @@ -78,7 +88,7 @@ Please note, this list is not complete and is not an endorsement. 
## Download -You can find the latest releases and precompiled binaries on GitHub under [Releases](https://github.com/rapid821/xmr-stak-hide/releases). +You can find the latest releases and precompiled binaries on GitHub under [Releases](https://github.com/fireice-uk/xmr-stak/releases). ## Default Developer Donation diff --git a/doc/compile_Linux.md b/doc/compile_Linux.md index ebf115430..6c80bc56a 100644 --- a/doc/compile_Linux.md +++ b/doc/compile_Linux.md @@ -9,10 +9,8 @@ - run `./amdgpu-pro-install --opencl=legacy,pal` from the unzipped folder - set the environment variable to opencl `export AMDAPPSDKROOT=/opt/amdgpu-pro/` -**ATTENTION** The linux driver 18.3 creating invalid shares. -If you have an issue with `invalid shares` please downgrade your driver or switch to ROCm. - For linux also the OpenSource driver ROCm 1.9.X+ is a well working alternative, see https://rocm.github.io/ROCmInstall.html +ROCm is not supporting old GPUs please check if your GPU is supported https://rocm.github.io/hardware.html. ### Cuda 8.0+ (only needed to use NVIDIA GPUs) diff --git a/doc/compile_Windows.md b/doc/compile_Windows.md index 8fe4dcf53..64d68bab1 100644 --- a/doc/compile_Windows.md +++ b/doc/compile_Windows.md @@ -34,9 +34,6 @@ - Download & install the AMD driver: https://www.amd.com/en/support -**ATTENTION** Many windows driver 18.5+ creating invalid shares. -If you have an issue with `invalid shares` please downgrade your driver. - - Download and install the latest version of the OCL-SDK from https://github.com/GPUOpen-LibrariesAndSDKs/OCL-SDK/releases Do not follow old information that you need the AMD APP SDK. AMD has removed the APP SDK and is now shipping the OCL-SDK_light. 
diff --git a/xmrstak/backend/amd/OclCryptonightR_gen.cpp b/xmrstak/backend/amd/OclCryptonightR_gen.cpp new file mode 100644 index 000000000..4aabe51d0 --- /dev/null +++ b/xmrstak/backend/amd/OclCryptonightR_gen.cpp @@ -0,0 +1,354 @@ +#include +#include +#include +#include +#include + + +#include "xmrstak/backend/amd/OclCryptonightR_gen.hpp" +#include "xmrstak/backend/cpu/crypto/variant4_random_math.h" +#include "xmrstak/misc/console.hpp" +#include "xmrstak/cpputil/read_write_lock.h" + +#include +#include +#include + + +namespace xmrstak +{ +namespace amd +{ + +static std::string get_code(const V4_Instruction* code, int code_size) +{ + std::stringstream s; + + for (int i = 0; i < code_size; ++i) + { + const V4_Instruction inst = code[i]; + + const uint32_t a = inst.dst_index; + const uint32_t b = inst.src_index; + + switch (inst.opcode) + { + case MUL: + s << 'r' << a << "*=r" << b << ';'; + break; + + case ADD: + s << 'r' << a << "+=r" << b << '+' << inst.C << "U;"; + break; + + case SUB: + s << 'r' << a << "-=r" << b << ';'; + break; + + case ROR: + case ROL: + s << 'r' << a << "=rotate(r" << a << ((inst.opcode == ROR) ? 
",ROT_BITS-r" : ",r") << b << ");"; + break; + + case XOR: + s << 'r' << a << "^=r" << b << ';'; + break; + } + + s << '\n'; + } + + return s.str(); +} + +struct CacheEntry +{ + CacheEntry(xmrstak_algo algo, uint64_t height, size_t deviceIdx, cl_program program) : + algo(algo), + height(height), + deviceIdx(deviceIdx), + program(program) + {} + + xmrstak_algo algo; + uint64_t height; + size_t deviceIdx; + cl_program program; +}; + +struct BackgroundTaskBase +{ + virtual ~BackgroundTaskBase() {} + virtual void exec() = 0; +}; + +template +struct BackgroundTask : public BackgroundTaskBase +{ + BackgroundTask(T&& func) : m_func(std::move(func)) {} + void exec() override { m_func(); } + + T m_func; +}; + +static ::cpputil::RWLock CryptonightR_cache_mutex; +static std::mutex CryptonightR_build_mutex; +static std::vector CryptonightR_cache; + +static std::mutex background_tasks_mutex; +static std::vector background_tasks; +static std::thread* background_thread = nullptr; + +static void background_thread_proc() +{ + std::vector tasks; + for (;;) { + tasks.clear(); + { + std::lock_guard g(background_tasks_mutex); + background_tasks.swap(tasks); + } + + for (BackgroundTaskBase* task : tasks) { + task->exec(); + delete task; + } + + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + } +} + +template +static void background_exec(T&& func) +{ + BackgroundTaskBase* task = new BackgroundTask(std::move(func)); + + std::lock_guard g(background_tasks_mutex); + background_tasks.push_back(task); + if (!background_thread) { + background_thread = new std::thread(background_thread_proc); + } +} + +static cl_program CryptonightR_build_program( + const GpuContext* ctx, + xmrstak_algo algo, + uint64_t height, + cl_kernel old_kernel, + std::string source_code, + std::string options) +{ + if(old_kernel) + clReleaseKernel(old_kernel); + + + std::vector old_programs; + old_programs.reserve(32); + { + CryptonightR_cache_mutex.WriteLock(); + + // Remove old programs from cache + 
for(size_t i = 0; i < CryptonightR_cache.size();) + { + const CacheEntry& entry = CryptonightR_cache[i]; + if ((entry.algo == algo) && (entry.height + 2 < height)) + { + printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu released (old program)", entry.height); + old_programs.push_back(entry.program); + CryptonightR_cache[i] = std::move(CryptonightR_cache.back()); + CryptonightR_cache.pop_back(); + } + else + { + ++i; + } + } + CryptonightR_cache_mutex.UnLock(); + } + + for(cl_program p : old_programs) { + clReleaseProgram(p); + } + + std::lock_guard g1(CryptonightR_build_mutex); + + cl_program program = nullptr; + { + CryptonightR_cache_mutex.ReadLock(); + + // Check if the cache already has this program (some other thread might have added it first) + for (const CacheEntry& entry : CryptonightR_cache) + { + if ((entry.algo == algo) && (entry.height == height) && (entry.deviceIdx == ctx->deviceIdx)) + { + program = entry.program; + break; + } + } + CryptonightR_cache_mutex.UnLock(); + } + + if (program) { + return program; + } + + cl_int ret; + const char* source = source_code.c_str(); + + program = clCreateProgramWithSource(ctx->opencl_ctx, 1, (const char**)&source, NULL, &ret); + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L0,"Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret)); + return program; + } + + ret = clBuildProgram(program, 1, &ctx->DeviceID, options.c_str(), NULL, NULL); + if(ret != CL_SUCCESS) + { + size_t len; + printer::inst()->print_msg(L0,"Error %s when calling clBuildProgram.", err_to_str(ret)); + + if((ret = clGetProgramBuildInfo(program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS) + { + printer::inst()->print_msg(L0,"Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret)); + return program; + } + + char* BuildLog = (char*)malloc(len + 1); + BuildLog[0] = '\0'; + + if((ret = clGetProgramBuildInfo(program, 
ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS) + { + free(BuildLog); + printer::inst()->print_msg(L0,"Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret)); + return program; + } + + printer::inst()->print_str("Build log:\n"); + std::cerr<DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L0,"Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret)); + return program; + } + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + while(status == CL_BUILD_IN_PROGRESS); + + + printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu compiled", height); + + CryptonightR_cache_mutex.WriteLock(); + CryptonightR_cache.emplace_back(algo, height, ctx->deviceIdx, program); + CryptonightR_cache_mutex.UnLock(); + return program; +} + +cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t height, bool background, cl_kernel old_kernel) +{ + if (background) { + background_exec([=](){ CryptonightR_get_program(ctx, algo, height, false, old_kernel); }); + return nullptr; + } + + const char* source_code_template = + #include "amd_gpu/opencl/wolf-aes.cl" + #include "amd_gpu/opencl/cryptonight_r.cl" + ; + const char include_name[] = "XMRSTAK_INCLUDE_RANDOM_MATH"; + const char* offset = strstr(source_code_template, include_name); + if (!offset) + { + printer::inst()->print_msg(LDEBUG, "CryptonightR_get_program: XMRSTAK_INCLUDE_RANDOM_MATH not found in cryptonight_r.cl", algo); + return nullptr; + } + + V4_Instruction code[256]; + int code_size; + switch (algo.Id()) + { + case cryptonight_r_wow: + code_size = v4_random_math_init(code, height); + break; + case cryptonight_r: + code_size = v4_random_math_init(code, height); + break; + default: + printer::inst()->print_msg(LDEBUG, "CryptonightR_get_program: invalid algo %d", algo); + return nullptr; + } + + std::string 
source_code(source_code_template, offset); + source_code.append(get_code(code, code_size)); + source_code.append(offset + sizeof(include_name) - 1); + + // scratchpad size for the selected mining algorithm + size_t hashMemSize = algo.Mem(); + int threadMemMask = algo.Mask(); + int hashIterations = algo.Iter(); + + size_t mem_chunk_exp = 1u << ctx->memChunk; + size_t strided_index = ctx->stridedIndex; + /* Adjust the config settings to a valid combination + * this is required if the dev pool is mining monero + * but the user tuned there settings for another currency + */ + if(algo == cryptonight_r || algo == cryptonight_r_wow) + { + if(ctx->memChunk < 2) + mem_chunk_exp = 1u << 2; + if(strided_index == 1) + strided_index = 0; + } + + // if intensity is a multiple of worksize than comp mode is not needed + int needCompMode = ctx->compMode && ctx->rawIntensity % ctx->workSize != 0 ? 1 : 0; + + std::string options; + options += " -DITERATIONS=" + std::to_string(hashIterations); + options += " -DMASK=" + std::to_string(threadMemMask) + "U"; + options += " -DWORKSIZE=" + std::to_string(ctx->workSize) + "U"; + options += " -DSTRIDED_INDEX=" + std::to_string(strided_index); + options += " -DMEM_CHUNK_EXPONENT=" + std::to_string(mem_chunk_exp) + "U"; + options += " -DCOMP_MODE=" + std::to_string(needCompMode); + options += " -DMEMORY=" + std::to_string(hashMemSize) + "LU"; + options += " -DALGO=" + std::to_string(algo.Id()); + options += " -DCN_UNROLL=" + std::to_string(ctx->unroll); + + if(algo == cryptonight_gpu) + options += " -cl-fp32-correctly-rounded-divide-sqrt"; + + + const char* source = source_code.c_str(); + + { + CryptonightR_cache_mutex.ReadLock(); + + // Check if the cache has this program + for (const CacheEntry& entry : CryptonightR_cache) + { + if ((entry.algo == algo) && (entry.height == height) && (entry.deviceIdx == ctx->deviceIdx)) + { + printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu found in cache", height); + auto result = 
entry.program; + CryptonightR_cache_mutex.UnLock(); + return result; + } + } + CryptonightR_cache_mutex.UnLock(); + + } + + return CryptonightR_build_program(ctx, algo, height, old_kernel, source, options); +} + +} // namespace amd +} // namespace xmrstak diff --git a/xmrstak/backend/amd/OclCryptonightR_gen.hpp b/xmrstak/backend/amd/OclCryptonightR_gen.hpp new file mode 100644 index 000000000..a69df9074 --- /dev/null +++ b/xmrstak/backend/amd/OclCryptonightR_gen.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include "xmrstak/backend/cryptonight.hpp" + +#include +#include +#include + +#if defined(__APPLE__) +#include +#else +#include +#endif + +#include "xmrstak/backend/amd/amd_gpu/gpu.hpp" + +namespace xmrstak +{ +namespace amd +{ + +cl_program CryptonightR_get_program(GpuContext* ctx, const xmrstak_algo algo, + uint64_t height, bool background = false, cl_kernel old_kernel = nullptr); + +} // namespace amd +} // namespace xmrstak diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 408cad97a..a2cbe8f54 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -19,6 +19,7 @@ #include "xmrstak/params.hpp" #include "xmrstak/version.hpp" #include "xmrstak/net/msgstruct.hpp" +#include "xmrstak/backend/amd/OclCryptonightR_gen.hpp" #include #include @@ -104,143 +105,6 @@ static inline long long unsigned int int_port(size_t i) #include "gpu.hpp" -const char* err_to_str(cl_int ret) -{ - switch(ret) - { - case CL_SUCCESS: - return "CL_SUCCESS"; - case CL_DEVICE_NOT_FOUND: - return "CL_DEVICE_NOT_FOUND"; - case CL_DEVICE_NOT_AVAILABLE: - return "CL_DEVICE_NOT_AVAILABLE"; - case CL_COMPILER_NOT_AVAILABLE: - return "CL_COMPILER_NOT_AVAILABLE"; - case CL_MEM_OBJECT_ALLOCATION_FAILURE: - return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; - case CL_OUT_OF_RESOURCES: - return "CL_OUT_OF_RESOURCES"; - case CL_OUT_OF_HOST_MEMORY: - return "CL_OUT_OF_HOST_MEMORY"; - case CL_PROFILING_INFO_NOT_AVAILABLE: - return 
"CL_PROFILING_INFO_NOT_AVAILABLE"; - case CL_MEM_COPY_OVERLAP: - return "CL_MEM_COPY_OVERLAP"; - case CL_IMAGE_FORMAT_MISMATCH: - return "CL_IMAGE_FORMAT_MISMATCH"; - case CL_IMAGE_FORMAT_NOT_SUPPORTED: - return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; - case CL_BUILD_PROGRAM_FAILURE: - return "CL_BUILD_PROGRAM_FAILURE"; - case CL_MAP_FAILURE: - return "CL_MAP_FAILURE"; - case CL_MISALIGNED_SUB_BUFFER_OFFSET: - return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; - case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: - return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; -#ifdef CL_VERSION_1_2 - case CL_COMPILE_PROGRAM_FAILURE: - return "CL_COMPILE_PROGRAM_FAILURE"; - case CL_LINKER_NOT_AVAILABLE: - return "CL_LINKER_NOT_AVAILABLE"; - case CL_LINK_PROGRAM_FAILURE: - return "CL_LINK_PROGRAM_FAILURE"; - case CL_DEVICE_PARTITION_FAILED: - return "CL_DEVICE_PARTITION_FAILED"; - case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: - return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; -#endif - case CL_INVALID_VALUE: - return "CL_INVALID_VALUE"; - case CL_INVALID_DEVICE_TYPE: - return "CL_INVALID_DEVICE_TYPE"; - case CL_INVALID_PLATFORM: - return "CL_INVALID_PLATFORM"; - case CL_INVALID_DEVICE: - return "CL_INVALID_DEVICE"; - case CL_INVALID_CONTEXT: - return "CL_INVALID_CONTEXT"; - case CL_INVALID_QUEUE_PROPERTIES: - return "CL_INVALID_QUEUE_PROPERTIES"; - case CL_INVALID_COMMAND_QUEUE: - return "CL_INVALID_COMMAND_QUEUE"; - case CL_INVALID_HOST_PTR: - return "CL_INVALID_HOST_PTR"; - case CL_INVALID_MEM_OBJECT: - return "CL_INVALID_MEM_OBJECT"; - case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: - return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; - case CL_INVALID_IMAGE_SIZE: - return "CL_INVALID_IMAGE_SIZE"; - case CL_INVALID_SAMPLER: - return "CL_INVALID_SAMPLER"; - case CL_INVALID_BINARY: - return "CL_INVALID_BINARY"; - case CL_INVALID_BUILD_OPTIONS: - return "CL_INVALID_BUILD_OPTIONS"; - case CL_INVALID_PROGRAM: - return "CL_INVALID_PROGRAM"; - case CL_INVALID_PROGRAM_EXECUTABLE: - return "CL_INVALID_PROGRAM_EXECUTABLE"; 
- case CL_INVALID_KERNEL_NAME: - return "CL_INVALID_KERNEL_NAME"; - case CL_INVALID_KERNEL_DEFINITION: - return "CL_INVALID_KERNEL_DEFINITION"; - case CL_INVALID_KERNEL: - return "CL_INVALID_KERNEL"; - case CL_INVALID_ARG_INDEX: - return "CL_INVALID_ARG_INDEX"; - case CL_INVALID_ARG_VALUE: - return "CL_INVALID_ARG_VALUE"; - case CL_INVALID_ARG_SIZE: - return "CL_INVALID_ARG_SIZE"; - case CL_INVALID_KERNEL_ARGS: - return "CL_INVALID_KERNEL_ARGS"; - case CL_INVALID_WORK_DIMENSION: - return "CL_INVALID_WORK_DIMENSION"; - case CL_INVALID_WORK_GROUP_SIZE: - return "CL_INVALID_WORK_GROUP_SIZE"; - case CL_INVALID_WORK_ITEM_SIZE: - return "CL_INVALID_WORK_ITEM_SIZE"; - case CL_INVALID_GLOBAL_OFFSET: - return "CL_INVALID_GLOBAL_OFFSET"; - case CL_INVALID_EVENT_WAIT_LIST: - return "CL_INVALID_EVENT_WAIT_LIST"; - case CL_INVALID_EVENT: - return "CL_INVALID_EVENT"; - case CL_INVALID_OPERATION: - return "CL_INVALID_OPERATION"; - case CL_INVALID_GL_OBJECT: - return "CL_INVALID_GL_OBJECT"; - case CL_INVALID_BUFFER_SIZE: - return "CL_INVALID_BUFFER_SIZE"; - case CL_INVALID_MIP_LEVEL: - return "CL_INVALID_MIP_LEVEL"; - case CL_INVALID_GLOBAL_WORK_SIZE: - return "CL_INVALID_GLOBAL_WORK_SIZE"; - case CL_INVALID_PROPERTY: - return "CL_INVALID_PROPERTY"; -#ifdef CL_VERSION_1_2 - case CL_INVALID_IMAGE_DESCRIPTOR: - return "CL_INVALID_IMAGE_DESCRIPTOR"; - case CL_INVALID_COMPILER_OPTIONS: - return "CL_INVALID_COMPILER_OPTIONS"; - case CL_INVALID_LINKER_OPTIONS: - return "CL_INVALID_LINKER_OPTIONS"; - case CL_INVALID_DEVICE_PARTITION_COUNT: - return "CL_INVALID_DEVICE_PARTITION_COUNT"; -#endif -#if defined(CL_VERSION_2_0) && !defined(CONF_ENFORCE_OpenCL_1_2) - case CL_INVALID_PIPE_SIZE: - return "CL_INVALID_PIPE_SIZE"; - case CL_INVALID_DEVICE_QUEUE: - return "CL_INVALID_DEVICE_QUEUE"; -#endif - default: - return "UNKNOWN_ERROR"; - } -} - #if 0 void printer::inst()->print_msg(L1,const char* fmt, ...); void printer::inst()->print_str(const char* str); @@ -284,11 +148,37 @@ size_t 
InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ return ERR_OCL_API; } - /* Some kernel spawn 8 times more threads than the user is configuring. - * To give the user the correct maximum work size we divide the hardware specific max by 8. - */ - MaximumWorkSize /= 8; + auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); + bool useCryptonight_gpu = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_gpu) != neededAlgorithms.end(); + + if(useCryptonight_gpu) + { + // work cn_1 we use 16x more threads than configured by the user + MaximumWorkSize /= 16; + } + else + { + /* Some kernel spawn 8 times more threads than the user is configuring. + * To give the user the correct maximum work size we divide the hardware specific max by 8. + */ + MaximumWorkSize /= 8; + } printer::inst()->print_msg(L1,"Device %lu work size %lu / %lu.", ctx->deviceIdx, ctx->workSize, MaximumWorkSize); + + if(ctx->workSize > MaximumWorkSize) + { + ctx->workSize = MaximumWorkSize; + printer::inst()->print_msg(L1,"Device %lu work size to large, reduce to %lu / %lu.", ctx->deviceIdx, ctx->workSize, MaximumWorkSize); + } + + const std::string backendName = xmrstak::params::inst().openCLVendor; + if( (ctx->stridedIndex == 2 || ctx->stridedIndex == 3) && (ctx->rawIntensity % ctx->workSize) != 0) + { + size_t reduced_intensity = (ctx->rawIntensity / ctx->workSize) * ctx->workSize; + ctx->rawIntensity = reduced_intensity; + printer::inst()->print_msg(L0, "WARNING %s: gpu %d intensity is not a multiple of 'worksize', auto reduce intensity to %d", backendName.c_str(), ctx->deviceIdx, int(reduced_intensity)); + } + #if defined(CL_VERSION_2_0) && !defined(CONF_ENFORCE_OpenCL_1_2) const cl_queue_properties CommandQueueProperties[] = { 0, 0, 0 }; ctx->CommandQueues = clCreateCommandQueueWithProperties(opencl_ctx, ctx->DeviceID, CommandQueueProperties, &ret); @@ -316,10 +206,11 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, 
GpuContext* ctx, const char* source_ return ERR_OCL_API; } - size_t scratchPadSize = std::max( - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()), - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) - ); + size_t scratchPadSize = 0; + for(const auto algo : neededAlgorithms) + { + scratchPadSize = std::max(scratchPadSize, algo.Mem()); + } size_t g_thd = ctx->rawIntensity; ctx->ExtraBuffers[0] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, scratchPadSize * g_thd, NULL, &ret); @@ -390,18 +281,12 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ return ERR_OCL_API; } - xmrstak_algo miner_algo[2] = { - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo(), - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() - }; - int num_algos = miner_algo[0] == miner_algo[1] ? 1 : 2; - - for(int ii = 0; ii < num_algos; ++ii) + for(const auto miner_algo : neededAlgorithms) { // scratchpad size for the selected mining algorithm - size_t hashMemSize = cn_select_memory(miner_algo[ii]); - int threadMemMask = cn_select_mask(miner_algo[ii]); - int hashIterations = cn_select_iter(miner_algo[ii]); + size_t hashMemSize = miner_algo.Mem(); + int threadMemMask = miner_algo.Mask(); + int hashIterations = miner_algo.Iter(); size_t mem_chunk_exp = 1u << ctx->memChunk; size_t strided_index = ctx->stridedIndex; @@ -409,7 +294,20 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ * this is required if the dev pool is mining monero * but the user tuned there settings for another currency */ - if(miner_algo[ii] == cryptonight_monero_v8) + if(miner_algo == cryptonight_monero_v8) + { + if(ctx->memChunk < 2) + mem_chunk_exp = 1u << 2; + if(strided_index == 1) + strided_index = 0; + } + + if(miner_algo == cryptonight_gpu) + { + strided_index = 0; + } + + if(miner_algo == cryptonight_r || 
miner_algo == cryptonight_r_wow) { if(ctx->memChunk < 2) mem_chunk_exp = 1u << 2; @@ -428,7 +326,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ options += " -DMEM_CHUNK_EXPONENT=" + std::to_string(mem_chunk_exp) + "U"; options += " -DCOMP_MODE=" + std::to_string(needCompMode); options += " -DMEMORY=" + std::to_string(hashMemSize) + "LU"; - options += " -DALGO=" + std::to_string(miner_algo[ii]); + options += " -DALGO=" + std::to_string(miner_algo.Id()); options += " -DCN_UNROLL=" + std::to_string(ctx->unroll); /* AMD driver output is something like: `1445.5 (VM)` * and is mapped to `14` only. The value is only used for a compiler @@ -436,6 +334,9 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ */ options += " -DOPENCL_DRIVER_MAJOR=" + std::to_string(std::stoi(openCLDriverVer.data()) / 100); + if(miner_algo == cryptonight_gpu) + options += " -cl-fp32-correctly-rounded-divide-sqrt"; + /* create a hash for the compile time cache * used data: * - source code @@ -457,20 +358,20 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ { if(xmrstak::params::inst().AMDCache) printer::inst()->print_msg(L1,"OpenCL device %u - Precompiled code %s not found. 
Compiling ...",ctx->deviceIdx, cache_file.c_str()); - ctx->Program[ii] = clCreateProgramWithSource(opencl_ctx, 1, (const char**)&source_code, NULL, &ret); + ctx->Program[miner_algo] = clCreateProgramWithSource(opencl_ctx, 1, (const char**)&source_code, NULL, &ret); if(ret != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret)); return ERR_OCL_API; } - ret = clBuildProgram(ctx->Program[ii], 1, &ctx->DeviceID, options.c_str(), NULL, NULL); + ret = clBuildProgram(ctx->Program[miner_algo], 1, &ctx->DeviceID, options.c_str(), NULL, NULL); if(ret != CL_SUCCESS) { size_t len; printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram.", err_to_str(ret)); - if((ret = clGetProgramBuildInfo(ctx->Program[ii], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS) + if((ret = clGetProgramBuildInfo(ctx->Program[miner_algo], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret)); return ERR_OCL_API; @@ -479,7 +380,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ char* BuildLog = (char*)malloc(len + 1); BuildLog[0] = '\0'; - if((ret = clGetProgramBuildInfo(ctx->Program[ii], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS) + if((ret = clGetProgramBuildInfo(ctx->Program[miner_algo], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS) { free(BuildLog); printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret)); @@ -494,11 +395,11 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ } cl_uint num_devices; - clGetProgramInfo(ctx->Program[ii], CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices,NULL); + clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_NUM_DEVICES, 
sizeof(cl_uint), &num_devices,NULL); std::vector devices_ids(num_devices); - clGetProgramInfo(ctx->Program[ii], CL_PROGRAM_DEVICES, sizeof(cl_device_id)* devices_ids.size(), devices_ids.data(),NULL); + clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_DEVICES, sizeof(cl_device_id)* devices_ids.size(), devices_ids.data(),NULL); int dev_id = 0; /* Search for the gpu within the program context. * The id can be different to ctx->DeviceID. @@ -513,7 +414,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ cl_build_status status; do { - if((ret = clGetProgramBuildInfo(ctx->Program[ii], ctx->DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS) + if((ret = clGetProgramBuildInfo(ctx->Program[miner_algo], ctx->DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret)); return ERR_OCL_API; @@ -525,7 +426,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ if(xmrstak::params::inst().AMDCache) { std::vector binary_sizes(num_devices); - clGetProgramInfo (ctx->Program[ii], CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * binary_sizes.size(), binary_sizes.data(), NULL); + clGetProgramInfo (ctx->Program[miner_algo], CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * binary_sizes.size(), binary_sizes.data(), NULL); std::vector all_programs(num_devices); std::vector> program_storage; @@ -541,7 +442,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ p_id++; } - if((ret = clGetProgramInfo(ctx->Program[ii], CL_PROGRAM_BINARIES, num_devices * sizeof(char*), all_programs.data(),NULL)) != CL_SUCCESS) + if((ret = clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_BINARIES, num_devices * sizeof(char*), all_programs.data(),NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling 
clGetProgramInfo.", err_to_str(ret)); return ERR_OCL_API; @@ -565,7 +466,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ auto data_ptr = s.data(); cl_int clStatus; - ctx->Program[ii] = clCreateProgramWithBinary( + ctx->Program[miner_algo] = clCreateProgramWithBinary( opencl_ctx, 1, &ctx->DeviceID, &bin_size, (const unsigned char **)&data_ptr, &clStatus, &ret ); @@ -574,7 +475,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithBinary. Try to delete file %s", err_to_str(ret), cache_file.c_str()); return ERR_OCL_API; } - ret = clBuildProgram(ctx->Program[ii], 1, &ctx->DeviceID, NULL, NULL, NULL); + ret = clBuildProgram(ctx->Program[miner_algo], 1, &ctx->DeviceID, NULL, NULL, NULL); if(ret != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram. Try to delete file %s", err_to_str(ret), cache_file.c_str()); @@ -582,40 +483,35 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ } } - std::vector KernelNames = { "cn0", "cn1", "cn2", "Blake", "Groestl", "JH", "Skein" }; + std::vector KernelNames = { "cn2", "Blake", "Groestl", "JH", "Skein" }; + if(miner_algo == cryptonight_gpu) + { + KernelNames.insert(KernelNames.begin(), "cn1_cn_gpu"); + KernelNames.insert(KernelNames.begin(), "cn0_cn_gpu"); + } + else + { + KernelNames.insert(KernelNames.begin(), "cn1"); + KernelNames.insert(KernelNames.begin(), "cn0"); + } + // append algorithm number to kernel name for(int k = 0; k < 3; k++) - KernelNames[k] += std::to_string(miner_algo[ii]); + KernelNames[k] += std::to_string(miner_algo); - if(ii == 0) + if(miner_algo == cryptonight_gpu) { - for(int i = 0; i < 7; ++i) - { - ctx->Kernels[ii][i] = clCreateKernel(ctx->Program[ii], KernelNames[i].c_str(), &ret); - if(ret != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel_0 %s.", 
err_to_str(ret), KernelNames[i].c_str()); - return ERR_OCL_API; - } - } + KernelNames.push_back(std::string("cn00_cn_gpu") + std::to_string(miner_algo)); } - else + + for(int i = 0; i < KernelNames.size(); ++i) { - for(int i = 0; i < 3; ++i) - { - ctx->Kernels[ii][i] = clCreateKernel(ctx->Program[ii], KernelNames[i].c_str(), &ret); - if(ret != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel_1 %s.", err_to_str(ret), KernelNames[i].c_str()); - return ERR_OCL_API; - } - } - // move kernel from the main algorithm into the root algorithm kernel space - for(int i = 3; i < 7; ++i) + ctx->Kernels[miner_algo][i] = clCreateKernel(ctx->Program[miner_algo], KernelNames[i].c_str(), &ret); + if(ret != CL_SUCCESS) { - ctx->Kernels[ii][i] = ctx->Kernels[0][i]; + printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel_0 %s.", err_to_str(ret), KernelNames[i].c_str()); + return ERR_OCL_API; } - } } ctx->Nonce = 0; @@ -830,8 +726,6 @@ int getAMDPlatformIdx() // Returns 0 on success, -1 on stupid params, -2 on OpenCL API error size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) { - - cl_context opencl_ctx; cl_int ret; cl_uint entries; @@ -910,15 +804,6 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) TempDeviceList[i] = DeviceIDList[ctx[i].deviceIdx]; } - opencl_ctx = clCreateContext(NULL, num_gpus, TempDeviceList, NULL, NULL, &ret); - if(ret != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clCreateContext.", err_to_str(ret)); - return ERR_OCL_API; - } - - //char* source_code = LoadTextFile(sSourcePath); - const char *fastIntMathV2CL = #include "./opencl/fast_int_math_v2.cl" ; @@ -943,6 +828,9 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) const char *wolfSkeinCL = #include "./opencl/wolf-skein.cl" ; + const char *cryptonight_gpu = + #include "./opencl/cryptonight_gpu.cl" + ; std::string source_code(cryptonightCL); 
source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL); @@ -952,12 +840,27 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_JH"), jhCL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_BLAKE256"), blake256CL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_GROESTL256"), groestl256CL); + source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_CN_GPU"), cryptonight_gpu); // create a directory for the OpenCL compile cache create_directory(get_home() + "/.openclcache"); std::vector> interleaveData(num_gpus, nullptr); + std::vector context_vec(entries, nullptr); + for(int i = 0; i < num_gpus; ++i) + { + if(context_vec[ctx[i].deviceIdx] == nullptr) + { + context_vec[ctx[i].deviceIdx] = clCreateContext(NULL, 1, &(ctx[i].DeviceID), NULL, NULL, &ret); + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clCreateContext.", err_to_str(ret)); + return ERR_OCL_API; + } + } + } + for(int i = 0; i < num_gpus; ++i) { const size_t devIdx = ctx[i].deviceIdx; @@ -976,16 +879,9 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) ctx[i].interleaveData = interleaveData[devIdx]; ctx[i].interleaveData->adjustThreshold = static_cast(ctx[i].interleave)/100.0; ctx[i].interleaveData->startAdjustThreshold = ctx[i].interleaveData->adjustThreshold; + ctx[i].opencl_ctx = context_vec[ctx[i].deviceIdx]; - const std::string backendName = xmrstak::params::inst().openCLVendor; - if( (ctx[i].stridedIndex == 2 || ctx[i].stridedIndex == 3) && (ctx[i].rawIntensity % ctx[i].workSize) != 0) - { - size_t reduced_intensity = (ctx[i].rawIntensity / ctx[i].workSize) * ctx[i].workSize; - ctx[i].rawIntensity = reduced_intensity; - printer::inst()->print_msg(L0, "WARNING %s: gpu %d intensity is not a multiple of 'worksize', auto reduce 
intensity to %d", backendName.c_str(), ctx[i].deviceIdx, int(reduced_intensity)); - } - - if((ret = InitOpenCLGpu(opencl_ctx, &ctx[i], source_code.c_str())) != ERR_SUCCESS) + if((ret = InitOpenCLGpu(ctx->opencl_ctx, &ctx[i], source_code.c_str())) != ERR_SUCCESS) { return ret; } @@ -994,10 +890,10 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) return ERR_SUCCESS; } -size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo) +size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, const xmrstak_algo& miner_algo, uint64_t height) { - // switch to the kernel storage - int kernel_storage = miner_algo == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ? 0 : 1; + + auto & Kernels = ctx->Kernels[miner_algo.Id()]; cl_int ret; @@ -1015,51 +911,103 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar return ERR_OCL_API; } - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 0, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[0], 0, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 0.", err_to_str(ret)); return ERR_OCL_API; } // Scratchpads - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 1, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[0], 1, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret)); return ERR_OCL_API; } // States - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg 
for kernel 0, argument 2.", err_to_str(ret)); return ERR_OCL_API; } // Threads - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 3, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[0], 3, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 3.", err_to_str(ret)); return(ERR_OCL_API); } - // CN1 Kernel + if(miner_algo == cryptonight_gpu) + { + // we use an additional cn0 kernel to prepare the scratchpad + // Scratchpads + if((ret = clSetKernelArg(Kernels[7], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret)); + return ERR_OCL_API; + } + + // States + if((ret = clSetKernelArg(Kernels[7], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret)); + return ERR_OCL_API; + } + } + + // CN1 Kernel + + if ((miner_algo == cryptonight_r) || (miner_algo == cryptonight_r_wow)) { + + // Get new kernel + cl_program program = xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height); + + if (program != ctx->ProgramCryptonightR) { + cl_int ret; + cl_kernel kernel = clCreateKernel(program, "cn1_cryptonight_r", &ret); + + cl_kernel old_kernel = nullptr; + if (ret != CL_SUCCESS) { + printer::inst()->print_msg(LDEBUG, "CryptonightR: clCreateKernel returned error %s", err_to_str(ret)); + } + else { + old_kernel = Kernels[1]; + Kernels[1] = kernel; + } + ctx->ProgramCryptonightR = program; + + uint32_t PRECOMPILATION_DEPTH = 4; + + // Precompile next program in background + xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height + 1, true, old_kernel); + for (int i = 2; i <= PRECOMPILATION_DEPTH; ++i) + xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height + i, true, nullptr); + + 
printer::inst()->print_msg(LDEBUG, "Thread #%zu updated CryptonightR", ctx->deviceIdx); + } + else + { + printer::inst()->print_msg(LDEBUG, "Thread #%zu found CryptonightR", ctx->deviceIdx); + } + } // Scratchpads - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[1], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 0.", err_to_str(ret)); return ERR_OCL_API; } // States - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[1], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 1.", err_to_str(ret)); return ERR_OCL_API; } // Threads - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 2, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[1], 2, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 2.", err_to_str(ret)); return(ERR_OCL_API); @@ -1068,7 +1016,7 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar if(miner_algo == cryptonight_monero || miner_algo == cryptonight_aeon || miner_algo == cryptonight_ipbc || miner_algo == cryptonight_stellite || miner_algo == cryptonight_masari || miner_algo == cryptonight_bittube2) { // Input - if ((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 3, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) + if ((ret = clSetKernelArg(Kernels[1], 3, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) { printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, argument 4(input buffer).", err_to_str(ret)); return ERR_OCL_API; @@ -1077,89 +1025,115 @@ size_t 
XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar // CN3 Kernel // Scratchpads - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 0.", err_to_str(ret)); return ERR_OCL_API; } // States - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 1.", err_to_str(ret)); return ERR_OCL_API; } - // Branch 0 - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret)); - return ERR_OCL_API; - } - - // Branch 1 - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS) + if(miner_algo == cryptonight_gpu) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret)); - return ERR_OCL_API; - } - - // Branch 2 - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret)); - return ERR_OCL_API; - } + // Output + if((ret = clSetKernelArg(Kernels[2], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), 2, 2); + return ERR_OCL_API; + } - // Branch 3 - if((ret = 
clSetKernelArg(ctx->Kernels[kernel_storage][2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret)); - return ERR_OCL_API; - } + // Target + if((ret = clSetKernelArg(Kernels[2], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), 2, 3); + return ERR_OCL_API; + } - // Threads - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 6, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret)); - return(ERR_OCL_API); + // Threads + if((ret = clSetKernelArg(Kernels[2], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret)); + return(ERR_OCL_API); + } } - - for(int i = 0; i < 4; ++i) - { - // States - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + else { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 0); + // Branch 0 + if((ret = clSetKernelArg(Kernels[2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret)); return ERR_OCL_API; } - // Nonce buffer - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS) + // Branch 1 + if((ret = clSetKernelArg(Kernels[2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 1); + 
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret)); return ERR_OCL_API; } - // Output - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS) + // Branch 2 + if((ret = clSetKernelArg(Kernels[2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 2); + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret)); return ERR_OCL_API; } - // Target - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS) + // Branch 3 + if((ret = clSetKernelArg(Kernels[2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3); + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret)); return ERR_OCL_API; } - if((clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) + // Threads + if((ret = clSetKernelArg(Kernels[2], 6, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4); + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret)); return(ERR_OCL_API); } + + for(int i = 0; i < 4; ++i) + { + // States + if((ret = clSetKernelArg(Kernels[i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 0); + return ERR_OCL_API; + } + + // Nonce buffer + if((ret = clSetKernelArg(Kernels[i + 3], 1, 
sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 1); + return ERR_OCL_API; + } + + // Output + if((ret = clSetKernelArg(Kernels[i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 2); + return ERR_OCL_API; + } + + // Target + if((ret = clSetKernelArg(Kernels[i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3); + return ERR_OCL_API; + } + + if((clSetKernelArg(Kernels[i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4); + return(ERR_OCL_API); + } + } } return ERR_SUCCESS; @@ -1256,10 +1230,9 @@ uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment) return t0; } -size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) +size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner_algo) { - // switch to the kernel storage - int kernel_storage = miner_algo == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ? 
0 : 1; + const auto & Kernels = ctx->Kernels[miner_algo.Id()]; cl_int ret; cl_uint zero = 0; @@ -1294,7 +1267,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) } size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = { g_thd, 8 }, lthreads[2] = { 8, 8 }; - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 0); return ERR_OCL_API; @@ -1302,25 +1275,50 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) size_t tmpNonce = ctx->Nonce; - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + if(miner_algo == cryptonight_gpu) { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1); - return ERR_OCL_API; + size_t thd = 64; + size_t intens = g_intensity * thd; + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[7], 1, 0, &intens, &thd, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 7); + return ERR_OCL_API; + } + + size_t w_size_cn_gpu = w_size * 16; + size_t g_thd_cn_gpu = g_thd * 16; + + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[1], 1, 0, &g_thd_cn_gpu, &w_size_cn_gpu, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1); + return ERR_OCL_API; + } + } + else + { + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + { + 
printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1); + return ERR_OCL_API; + } } - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][2], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[2], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 2); return ERR_OCL_API; } - for(int i = 0; i < 4; ++i) + if(miner_algo != cryptonight_gpu) { - size_t tmpNonce = ctx->Nonce; - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][i + 3], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + for(int i = 0; i < 4; ++i) { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3); - return ERR_OCL_API; + size_t tmpNonce = ctx->Nonce; + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[i + 3], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3); + return ERR_OCL_API; + } } } diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp index 80fcbefde..ae2b506db 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.hpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp @@ -14,6 +14,8 @@ #include #include #include +#include +#include #define ERR_SUCCESS (0) #define ERR_OCL_API (2) @@ -50,8 +52,10 @@ struct GpuContext cl_mem InputBuffer; cl_mem OutputBuffer; cl_mem ExtraBuffers[6]; - cl_program Program[2]; - cl_kernel Kernels[2][8]; + cl_context opencl_ctx = nullptr; + std::map Program; + std::map> Kernels; + cl_program ProgramCryptonightR = nullptr; size_t freeMem; size_t maxMemPerAlloc; int computeUnits; @@ -65,12 +69,152 @@ struct GpuContext }; 
+namespace +{ + const char* err_to_str(cl_int ret) + { + switch(ret) + { + case CL_SUCCESS: + return "CL_SUCCESS"; + case CL_DEVICE_NOT_FOUND: + return "CL_DEVICE_NOT_FOUND"; + case CL_DEVICE_NOT_AVAILABLE: + return "CL_DEVICE_NOT_AVAILABLE"; + case CL_COMPILER_NOT_AVAILABLE: + return "CL_COMPILER_NOT_AVAILABLE"; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: + return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + case CL_OUT_OF_RESOURCES: + return "CL_OUT_OF_RESOURCES"; + case CL_OUT_OF_HOST_MEMORY: + return "CL_OUT_OF_HOST_MEMORY"; + case CL_PROFILING_INFO_NOT_AVAILABLE: + return "CL_PROFILING_INFO_NOT_AVAILABLE"; + case CL_MEM_COPY_OVERLAP: + return "CL_MEM_COPY_OVERLAP"; + case CL_IMAGE_FORMAT_MISMATCH: + return "CL_IMAGE_FORMAT_MISMATCH"; + case CL_IMAGE_FORMAT_NOT_SUPPORTED: + return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; + case CL_BUILD_PROGRAM_FAILURE: + return "CL_BUILD_PROGRAM_FAILURE"; + case CL_MAP_FAILURE: + return "CL_MAP_FAILURE"; + case CL_MISALIGNED_SUB_BUFFER_OFFSET: + return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; + case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: + return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; + #ifdef CL_VERSION_1_2 + case CL_COMPILE_PROGRAM_FAILURE: + return "CL_COMPILE_PROGRAM_FAILURE"; + case CL_LINKER_NOT_AVAILABLE: + return "CL_LINKER_NOT_AVAILABLE"; + case CL_LINK_PROGRAM_FAILURE: + return "CL_LINK_PROGRAM_FAILURE"; + case CL_DEVICE_PARTITION_FAILED: + return "CL_DEVICE_PARTITION_FAILED"; + case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: + return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; + #endif + case CL_INVALID_VALUE: + return "CL_INVALID_VALUE"; + case CL_INVALID_DEVICE_TYPE: + return "CL_INVALID_DEVICE_TYPE"; + case CL_INVALID_PLATFORM: + return "CL_INVALID_PLATFORM"; + case CL_INVALID_DEVICE: + return "CL_INVALID_DEVICE"; + case CL_INVALID_CONTEXT: + return "CL_INVALID_CONTEXT"; + case CL_INVALID_QUEUE_PROPERTIES: + return "CL_INVALID_QUEUE_PROPERTIES"; + case CL_INVALID_COMMAND_QUEUE: + return "CL_INVALID_COMMAND_QUEUE"; + case 
CL_INVALID_HOST_PTR: + return "CL_INVALID_HOST_PTR"; + case CL_INVALID_MEM_OBJECT: + return "CL_INVALID_MEM_OBJECT"; + case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: + return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; + case CL_INVALID_IMAGE_SIZE: + return "CL_INVALID_IMAGE_SIZE"; + case CL_INVALID_SAMPLER: + return "CL_INVALID_SAMPLER"; + case CL_INVALID_BINARY: + return "CL_INVALID_BINARY"; + case CL_INVALID_BUILD_OPTIONS: + return "CL_INVALID_BUILD_OPTIONS"; + case CL_INVALID_PROGRAM: + return "CL_INVALID_PROGRAM"; + case CL_INVALID_PROGRAM_EXECUTABLE: + return "CL_INVALID_PROGRAM_EXECUTABLE"; + case CL_INVALID_KERNEL_NAME: + return "CL_INVALID_KERNEL_NAME"; + case CL_INVALID_KERNEL_DEFINITION: + return "CL_INVALID_KERNEL_DEFINITION"; + case CL_INVALID_KERNEL: + return "CL_INVALID_KERNEL"; + case CL_INVALID_ARG_INDEX: + return "CL_INVALID_ARG_INDEX"; + case CL_INVALID_ARG_VALUE: + return "CL_INVALID_ARG_VALUE"; + case CL_INVALID_ARG_SIZE: + return "CL_INVALID_ARG_SIZE"; + case CL_INVALID_KERNEL_ARGS: + return "CL_INVALID_KERNEL_ARGS"; + case CL_INVALID_WORK_DIMENSION: + return "CL_INVALID_WORK_DIMENSION"; + case CL_INVALID_WORK_GROUP_SIZE: + return "CL_INVALID_WORK_GROUP_SIZE"; + case CL_INVALID_WORK_ITEM_SIZE: + return "CL_INVALID_WORK_ITEM_SIZE"; + case CL_INVALID_GLOBAL_OFFSET: + return "CL_INVALID_GLOBAL_OFFSET"; + case CL_INVALID_EVENT_WAIT_LIST: + return "CL_INVALID_EVENT_WAIT_LIST"; + case CL_INVALID_EVENT: + return "CL_INVALID_EVENT"; + case CL_INVALID_OPERATION: + return "CL_INVALID_OPERATION"; + case CL_INVALID_GL_OBJECT: + return "CL_INVALID_GL_OBJECT"; + case CL_INVALID_BUFFER_SIZE: + return "CL_INVALID_BUFFER_SIZE"; + case CL_INVALID_MIP_LEVEL: + return "CL_INVALID_MIP_LEVEL"; + case CL_INVALID_GLOBAL_WORK_SIZE: + return "CL_INVALID_GLOBAL_WORK_SIZE"; + case CL_INVALID_PROPERTY: + return "CL_INVALID_PROPERTY"; + #ifdef CL_VERSION_1_2 + case CL_INVALID_IMAGE_DESCRIPTOR: + return "CL_INVALID_IMAGE_DESCRIPTOR"; + case CL_INVALID_COMPILER_OPTIONS: + return 
"CL_INVALID_COMPILER_OPTIONS"; + case CL_INVALID_LINKER_OPTIONS: + return "CL_INVALID_LINKER_OPTIONS"; + case CL_INVALID_DEVICE_PARTITION_COUNT: + return "CL_INVALID_DEVICE_PARTITION_COUNT"; + #endif + #if defined(CL_VERSION_2_0) && !defined(CONF_ENFORCE_OpenCL_1_2) + case CL_INVALID_PIPE_SIZE: + return "CL_INVALID_PIPE_SIZE"; + case CL_INVALID_DEVICE_QUEUE: + return "CL_INVALID_DEVICE_QUEUE"; + #endif + default: + return "UNKNOWN_ERROR"; + } + } +} + uint32_t getNumPlatforms(); int getAMDPlatformIdx(); std::vector getAMDDevices(int index); size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx); -size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo); -size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo); +size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, const xmrstak_algo& miner_algo, uint64_t height); +size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner_algo); uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment = true); uint64_t updateTimings(GpuContext* ctx, const uint64_t t); diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 6a3def72c..2ca09c31c 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -14,6 +14,23 @@ R"===( * along with this program. If not, see . 
*/ +// defines to translate algorithm names int a same number used within cryptonight.h +#define invalid_algo 0 +#define cryptonight 1 +#define cryptonight_lite 2 +#define cryptonight_monero 3 +#define cryptonight_heavy 4 +#define cryptonight_aeon 5 +#define cryptonight_ipbc 6 +#define cryptonight_stellite 7 +#define cryptonight_masari 8 +#define cryptonight_haven 9 +#define cryptonight_bittube2 10 +#define cryptonight_monero_v8 11 +#define cryptonight_superfast 12 +#define cryptonight_gpu 13 +#define cryptonight_conceal 14 + /* For Mesa clover support */ #ifdef cl_clang_storage_class_specifiers # pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable @@ -34,10 +51,10 @@ R"===( */ inline uint2 amd_bitalign( const uint2 src0, const uint2 src1, const uint src2) { - uint2 result; + uint2 result; result.s0 = (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2)); result.s1 = (uint) (((((ulong)src0.s1) << 32) | (ulong)src1.s1) >> (src2)); - return result; + return result; } #endif @@ -61,23 +78,278 @@ inline uint2 amd_bitalign( const uint2 src0, const uint2 src1, const uint src2) */ inline int amd_bfe(const uint src0, const uint offset, const uint width) { - /* casts are removed because we can implement everything as uint - * int offset = src1; - * int width = src2; - * remove check for edge case, this function is always called with - * `width==8` - * @code - * if ( width == 0 ) - * return 0; - * @endcode - */ - if ( (offset + width) < 32u ) - return (src0 << (32u - offset - width)) >> (32u - width); - - return src0 >> offset; + /* casts are removed because we can implement everything as uint + * int offset = src1; + * int width = src2; + * remove check for edge case, this function is always called with + * `width==8` + * @code + * if ( width == 0 ) + * return 0; + * @endcode + */ + if ( (offset + width) < 32u ) + return (src0 << (32u - offset - width)) >> (32u - width); + + return src0 >> offset; +} +#endif + +static const __constant ulong 
keccakf_rndc[24] = +{ + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 +}; + +static const __constant uchar sbox[256] = +{ + 0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76, + 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, + 0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15, + 0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75, + 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84, + 0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF, + 0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8, + 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, + 0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73, + 0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB, + 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, + 0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08, + 0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A, + 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, + 0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, 0x9B, 
0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF, + 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16 +}; + + +void keccakf1600(ulong *s) +{ + for(int i = 0; i < 24; ++i) + { + ulong bc[5], tmp1, tmp2; + bc[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20] ^ rotate(s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22], 1UL); + bc[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21] ^ rotate(s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23], 1UL); + bc[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22] ^ rotate(s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24], 1UL); + bc[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23] ^ rotate(s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20], 1UL); + bc[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24] ^ rotate(s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21], 1UL); + + tmp1 = s[1] ^ bc[0]; + + s[0] ^= bc[4]; + s[1] = rotate(s[6] ^ bc[0], 44UL); + s[6] = rotate(s[9] ^ bc[3], 20UL); + s[9] = rotate(s[22] ^ bc[1], 61UL); + s[22] = rotate(s[14] ^ bc[3], 39UL); + s[14] = rotate(s[20] ^ bc[4], 18UL); + s[20] = rotate(s[2] ^ bc[1], 62UL); + s[2] = rotate(s[12] ^ bc[1], 43UL); + s[12] = rotate(s[13] ^ bc[2], 25UL); + s[13] = rotate(s[19] ^ bc[3], 8UL); + s[19] = rotate(s[23] ^ bc[2], 56UL); + s[23] = rotate(s[15] ^ bc[4], 41UL); + s[15] = rotate(s[4] ^ bc[3], 27UL); + s[4] = rotate(s[24] ^ bc[3], 14UL); + s[24] = rotate(s[21] ^ bc[0], 2UL); + s[21] = rotate(s[8] ^ bc[2], 55UL); + s[8] = rotate(s[16] ^ bc[0], 35UL); + s[16] = rotate(s[5] ^ bc[4], 36UL); + s[5] = rotate(s[3] ^ bc[2], 28UL); + s[3] = rotate(s[18] ^ bc[2], 21UL); + s[18] = rotate(s[17] ^ bc[1], 15UL); + s[17] = rotate(s[11] ^ bc[0], 10UL); + s[11] = rotate(s[7] ^ bc[1], 6UL); + s[7] = rotate(s[10] ^ bc[4], 3UL); + s[10] = rotate(tmp1, 1UL); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] 
= bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0] ^= keccakf_rndc[i]; + } +} + +static const __constant uint keccakf_rotc[24] = +{ + 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, + 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44 +}; + +static const __constant uint keccakf_piln[24] = +{ + 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, + 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1 +}; + +inline void keccakf1600_1(ulong st[25]) +{ + int i, round; + ulong t, bc[5]; + + #pragma unroll 1 + for (round = 0; round < 24; ++round) + { + bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20] ^ rotate(st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22], 1UL); + bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21] ^ rotate(st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23], 1UL); + bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22] ^ rotate(st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24], 1UL); + bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23] ^ rotate(st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20], 1UL); + bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24] ^ rotate(st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21], 1UL); + + st[0] ^= 
bc[4]; + st[5] ^= bc[4]; + st[10] ^= bc[4]; + st[15] ^= bc[4]; + st[20] ^= bc[4]; + + st[1] ^= bc[0]; + st[6] ^= bc[0]; + st[11] ^= bc[0]; + st[16] ^= bc[0]; + st[21] ^= bc[0]; + + st[2] ^= bc[1]; + st[7] ^= bc[1]; + st[12] ^= bc[1]; + st[17] ^= bc[1]; + st[22] ^= bc[1]; + + st[3] ^= bc[2]; + st[8] ^= bc[2]; + st[13] ^= bc[2]; + st[18] ^= bc[2]; + st[23] ^= bc[2]; + + st[4] ^= bc[3]; + st[9] ^= bc[3]; + st[14] ^= bc[3]; + st[19] ^= bc[3]; + st[24] ^= bc[3]; + + // Rho Pi + t = st[1]; + #pragma unroll + for (i = 0; i < 24; ++i) { + bc[0] = st[keccakf_piln[i]]; + st[keccakf_piln[i]] = rotate(t, (ulong)keccakf_rotc[i]); + t = bc[0]; + } + + #pragma unroll + for(int i = 0; i < 25; i += 5) + { + ulong tmp1 = st[i], tmp2 = st[i + 1]; + + st[i] = bitselect(st[i] ^ st[i + 2], st[i], st[i + 1]); + st[i + 1] = bitselect(st[i + 1] ^ st[i + 3], st[i + 1], st[i + 2]); + st[i + 2] = bitselect(st[i + 2] ^ st[i + 4], st[i + 2], st[i + 3]); + st[i + 3] = bitselect(st[i + 3] ^ tmp1, st[i + 3], st[i + 4]); + st[i + 4] = bitselect(st[i + 4] ^ tmp2, st[i + 4], tmp1); + } + + // Iota + st[0] ^= keccakf_rndc[round]; + } } +)===" +R"===( + +void keccakf1600_2(__local ulong *st) +{ + int i, round; + ulong t, bc[5]; + + #pragma unroll 1 + for (round = 0; round < 24; ++round) + { + bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20] ^ rotate(st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22], 1UL); + bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21] ^ rotate(st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23], 1UL); + bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22] ^ rotate(st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24], 1UL); + bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23] ^ rotate(st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20], 1UL); + bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24] ^ rotate(st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21], 1UL); + + st[0] ^= bc[4]; + st[5] ^= bc[4]; + st[10] ^= bc[4]; + st[15] ^= bc[4]; + st[20] ^= bc[4]; + + st[1] ^= bc[0]; + st[6] ^= bc[0]; + st[11] ^= bc[0]; + st[16] ^= bc[0]; + st[21] ^= 
bc[0]; + + st[2] ^= bc[1]; + st[7] ^= bc[1]; + st[12] ^= bc[1]; + st[17] ^= bc[1]; + st[22] ^= bc[1]; + + st[3] ^= bc[2]; + st[8] ^= bc[2]; + st[13] ^= bc[2]; + st[18] ^= bc[2]; + st[23] ^= bc[2]; + + st[4] ^= bc[3]; + st[9] ^= bc[3]; + st[14] ^= bc[3]; + st[19] ^= bc[3]; + st[24] ^= bc[3]; + + // Rho Pi + t = st[1]; + #pragma unroll + for (i = 0; i < 24; ++i) { + bc[0] = st[keccakf_piln[i]]; + st[keccakf_piln[i]] = rotate(t, (ulong)keccakf_rotc[i]); + t = bc[0]; + } + + #pragma unroll + for(int i = 0; i < 25; i += 5) + { + ulong tmp1 = st[i], tmp2 = st[i + 1]; + + st[i] = bitselect(st[i] ^ st[i + 2], st[i], st[i + 1]); + st[i + 1] = bitselect(st[i + 1] ^ st[i + 3], st[i + 1], st[i + 2]); + st[i + 2] = bitselect(st[i + 2] ^ st[i + 4], st[i + 2], st[i + 3]); + st[i + 3] = bitselect(st[i + 3] ^ tmp1, st[i + 3], st[i + 4]); + st[i + 4] = bitselect(st[i + 4] ^ tmp2, st[i + 4], tmp1); + } + + // Iota + st[0] ^= keccakf_rndc[round]; + } +} + +#define MEM_CHUNK (1<> right) | ( a.y << left ), + ((uint)a.y >> right) | ( a.z << left ), + ((uint)a.z >> right) | ( a.w << left ), + ((uint)a.w >> right) | ( a.x << left ) + ); } +#if (ALGO == cryptonight_gpu) + //#include "opencl/cryptonight_gpu.cl" + XMRSTAK_INCLUDE_CN_GPU +#endif + )===" R"===( void CNKeccak(ulong *output, ulong *input) { - ulong st[25]; + ulong st[25]; - // Copy 72 bytes - for(int i = 0; i < 9; ++i) st[i] = input[i]; + // Copy 72 bytes + for(int i = 0; i < 9; ++i) st[i] = input[i]; - // Last four and '1' bit for padding - //st[9] = as_ulong((uint2)(((uint *)input)[18], 0x00000001U)); + // Last four and '1' bit for padding + //st[9] = as_ulong((uint2)(((uint *)input)[18], 0x00000001U)); - st[9] = (input[9] & 0x00000000FFFFFFFFUL) | 0x0000000100000000UL; + st[9] = (input[9] & 0x00000000FFFFFFFFUL) | 0x0000000100000000UL; - for(int i = 10; i < 25; ++i) st[i] = 0x00UL; + for(int i = 10; i < 25; ++i) st[i] = 0x00UL; - // Last bit of padding - st[16] = 0x8000000000000000UL; + // Last bit of padding + st[16] = 
0x8000000000000000UL; - keccakf1600_1(st); + keccakf1600_1(st); - for(int i = 0; i < 25; ++i) output[i] = st[i]; + for(int i = 0; i < 25; ++i) output[i] = st[i]; } static const __constant uchar rcon[8] = { 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40 }; @@ -344,201 +466,180 @@ static const __constant uchar rcon[8] = { 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x void AESExpandKey256(uint *keybuf) { - //#pragma unroll 4 - for(uint c = 8, i = 1; c < 40; ++c) - { - // For 256-bit keys, an sbox permutation is done every other 4th uint generated, AND every 8th - uint t = ((!(c & 7)) || ((c & 7) == 4)) ? SubWord(keybuf[c - 1]) : keybuf[c - 1]; - - // If the uint we're generating has an index that is a multiple of 8, rotate and XOR with the round constant, - // then XOR this with previously generated uint. If it's 4 after a multiple of 8, only the sbox permutation - // is done, followed by the XOR. If neither are true, only the XOR with the previously generated uint is done. - keybuf[c] = keybuf[c - 8] ^ ((!(c & 7)) ? rotate(t, 24U) ^ as_uint((uchar4)(rcon[i++], 0U, 0U, 0U)) : t); - } + //#pragma unroll 4 + for(uint c = 8, i = 1; c < 40; ++c) + { + // For 256-bit keys, an sbox permutation is done every other 4th uint generated, AND every 8th + uint t = ((!(c & 7)) || ((c & 7) == 4)) ? SubWord(keybuf[c - 1]) : keybuf[c - 1]; + + // If the uint we're generating has an index that is a multiple of 8, rotate and XOR with the round constant, + // then XOR this with previously generated uint. If it's 4 after a multiple of 8, only the sbox permutation + // is done, followed by the XOR. If neither are true, only the XOR with the previously generated uint is done. + keybuf[c] = keybuf[c - 8] ^ ((!(c & 7)) ? 
rotate(t, 24U) ^ as_uint((uchar4)(rcon[i++], 0U, 0U, 0U)) : t); + } } )===" R"===( -#define MEM_CHUNK (1<> 4); + Scratchpad += gIdx * (MEMORY >> 4); #elif(STRIDED_INDEX==1) Scratchpad += gIdx; #elif(STRIDED_INDEX==2) - Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * (gIdx % WORKSIZE); + Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * (gIdx % WORKSIZE); #elif(STRIDED_INDEX==3) Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + (gIdx % WORKSIZE); #endif - if (get_local_id(1) == 0) - { - __local ulong* State = State_buf + get_local_id(0) * 25; + if (get_local_id(1) == 0) + { + __local ulong* State = State_buf + get_local_id(0) * 25; // NVIDIA #ifdef __NV_CL_C_VERSION for(uint i = 0; i < 8; ++i) State[i] = input[i]; #else - ((__local ulong8 *)State)[0] = vload8(0, input); + ((__local ulong8 *)State)[0] = vload8(0, input); #endif - State[8] = input[8]; - State[9] = input[9]; - State[10] = input[10]; - - ((__local uint *)State)[9] &= 0x00FFFFFFU; - ((__local uint *)State)[9] |= (((uint)get_global_id(0)) & 0xFF) << 24; - ((__local uint *)State)[10] &= 0xFF000000U; - /* explicit cast to `uint` is required because some OpenCL implementations (e.g. NVIDIA) - * handle get_global_id and get_global_offset as signed long long int and add - * 0xFFFFFFFF... 
to `get_global_id` if we set on host side a 32bit offset where the first bit is `1` - * (even if it is correct casted to unsigned on the host) - */ - ((__local uint *)State)[10] |= (((uint)get_global_id(0) >> 8)); - - for (int i = 11; i < 25; ++i) { - State[i] = 0x00UL; - } - - // Last bit of padding - State[16] = 0x8000000000000000UL; - - keccakf1600_2(State); - - #pragma unroll - for (int i = 0; i < 25; ++i) { - states[i] = State[i]; - } - } - } - - barrier(CLK_GLOBAL_MEM_FENCE); + State[8] = input[8]; + State[9] = input[9]; + State[10] = input[10]; + + ((__local uint *)State)[9] &= 0x00FFFFFFU; + ((__local uint *)State)[9] |= (((uint)get_global_id(0)) & 0xFF) << 24; + ((__local uint *)State)[10] &= 0xFF000000U; + /* explicit cast to `uint` is required because some OpenCL implementations (e.g. NVIDIA) + * handle get_global_id and get_global_offset as signed long long int and add + * 0xFFFFFFFF... to `get_global_id` if we set on host side a 32bit offset where the first bit is `1` + * (even if it is correct casted to unsigned on the host) + */ + ((__local uint *)State)[10] |= (((uint)get_global_id(0) >> 8)); + + for (int i = 11; i < 25; ++i) { + State[i] = 0x00UL; + } + + // Last bit of padding + State[16] = 0x8000000000000000UL; + + keccakf1600_2(State); + + #pragma unroll + for (int i = 0; i < 25; ++i) { + states[i] = State[i]; + } + } + } + + barrier(CLK_GLOBAL_MEM_FENCE); # if (COMP_MODE == 1) - // do not use early return here - if (gIdx < Threads) + // do not use early return here + if (gIdx < Threads) # endif - { - text = vload4(get_local_id(1) + 4, (__global uint *)(states)); - - #pragma unroll - for (int i = 0; i < 4; ++i) { - ((ulong *)ExpandedKey1)[i] = states[i]; - } - - AESExpandKey256(ExpandedKey1); - } - - mem_fence(CLK_LOCAL_MEM_FENCE); - -// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast -#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) - __local uint4 xin[8][8]; - { - - /* Also left over threads 
perform this loop. - * The left over thread results will be ignored - */ - #pragma unroll 16 - for (size_t i = 0; i < 16; i++) { - #pragma unroll 10 - for (int j = 0; j < 10; ++j) { - uint4 t = ((uint4 *)ExpandedKey1)[j]; - t.s0 ^= AES0[BYTE(text.s0, 0)] ^ AES1[BYTE(text.s1, 1)] ^ AES2[BYTE(text.s2, 2)] ^ AES3[BYTE(text.s3, 3)]; - t.s1 ^= AES0[BYTE(text.s1, 0)] ^ AES1[BYTE(text.s2, 1)] ^ AES2[BYTE(text.s3, 2)] ^ AES3[BYTE(text.s0, 3)]; - t.s2 ^= AES0[BYTE(text.s2, 0)] ^ AES1[BYTE(text.s3, 1)] ^ AES2[BYTE(text.s0, 2)] ^ AES3[BYTE(text.s1, 3)]; - t.s3 ^= AES0[BYTE(text.s3, 0)] ^ AES1[BYTE(text.s0, 1)] ^ AES2[BYTE(text.s1, 2)] ^ AES3[BYTE(text.s2, 3)]; - text = t; - } - - barrier(CLK_LOCAL_MEM_FENCE); - xin[get_local_id(1)][get_local_id(0)] = text; - barrier(CLK_LOCAL_MEM_FENCE); - text = mix_and_propagate(xin); - } - } + { + text = vload4(get_local_id(1) + 4, (__global uint *)(states)); + + #pragma unroll + for (int i = 0; i < 4; ++i) { + ((ulong *)ExpandedKey1)[i] = states[i]; + } + + AESExpandKey256(ExpandedKey1); + } + + mem_fence(CLK_LOCAL_MEM_FENCE); + +#if (ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + __local uint4 xin[8][8]; + { + + /* Also left over threads perform this loop. 
+ * The left over thread results will be ignored + */ + #pragma unroll 16 + for (size_t i = 0; i < 16; i++) { + #pragma unroll 10 + for (int j = 0; j < 10; ++j) { + uint4 t = ((uint4 *)ExpandedKey1)[j]; + t.s0 ^= AES0[BYTE(text.s0, 0)] ^ AES1[BYTE(text.s1, 1)] ^ AES2[BYTE(text.s2, 2)] ^ AES3[BYTE(text.s3, 3)]; + t.s1 ^= AES0[BYTE(text.s1, 0)] ^ AES1[BYTE(text.s2, 1)] ^ AES2[BYTE(text.s3, 2)] ^ AES3[BYTE(text.s0, 3)]; + t.s2 ^= AES0[BYTE(text.s2, 0)] ^ AES1[BYTE(text.s3, 1)] ^ AES2[BYTE(text.s0, 2)] ^ AES3[BYTE(text.s1, 3)]; + t.s3 ^= AES0[BYTE(text.s3, 0)] ^ AES1[BYTE(text.s0, 1)] ^ AES2[BYTE(text.s1, 2)] ^ AES3[BYTE(text.s2, 3)]; + text = t; + } + + barrier(CLK_LOCAL_MEM_FENCE); + xin[get_local_id(1)][get_local_id(0)] = text; + barrier(CLK_LOCAL_MEM_FENCE); + text = mix_and_propagate(xin); + } + } #endif #if(COMP_MODE==1) - // do not use early return here + // do not use early return here if(gIdx < Threads) #endif - { - - #pragma unroll 2 - for(int i = 0; i < (MEMORY >> 4); i += 8) { - #pragma unroll 10 - for (int j = 0; j < 10; ++j) { - uint4 t = ((uint4 *)ExpandedKey1)[j]; - t.s0 ^= AES0[BYTE(text.s0, 0)] ^ AES1[BYTE(text.s1, 1)] ^ AES2[BYTE(text.s2, 2)] ^ AES3[BYTE(text.s3, 3)]; - t.s1 ^= AES0[BYTE(text.s1, 0)] ^ AES1[BYTE(text.s2, 1)] ^ AES2[BYTE(text.s3, 2)] ^ AES3[BYTE(text.s0, 3)]; - t.s2 ^= AES0[BYTE(text.s2, 0)] ^ AES1[BYTE(text.s3, 1)] ^ AES2[BYTE(text.s0, 2)] ^ AES3[BYTE(text.s1, 3)]; - t.s3 ^= AES0[BYTE(text.s3, 0)] ^ AES1[BYTE(text.s0, 1)] ^ AES2[BYTE(text.s1, 2)] ^ AES3[BYTE(text.s2, 3)]; - text = t; - } - - Scratchpad[IDX(i + get_local_id(1))] = text; - } - } - mem_fence(CLK_GLOBAL_MEM_FENCE); + { + + #pragma unroll 2 + for(int i = 0; i < (MEMORY >> 4); i += 8) { + #pragma unroll 10 + for (int j = 0; j < 10; ++j) { + uint4 t = ((uint4 *)ExpandedKey1)[j]; + t.s0 ^= AES0[BYTE(text.s0, 0)] ^ AES1[BYTE(text.s1, 1)] ^ AES2[BYTE(text.s2, 2)] ^ AES3[BYTE(text.s3, 3)]; + t.s1 ^= AES0[BYTE(text.s1, 0)] ^ AES1[BYTE(text.s2, 1)] ^ AES2[BYTE(text.s3, 2)] ^ 
AES3[BYTE(text.s0, 3)]; + t.s2 ^= AES0[BYTE(text.s2, 0)] ^ AES1[BYTE(text.s3, 1)] ^ AES2[BYTE(text.s0, 2)] ^ AES3[BYTE(text.s1, 3)]; + t.s3 ^= AES0[BYTE(text.s3, 0)] ^ AES1[BYTE(text.s0, 1)] ^ AES2[BYTE(text.s1, 2)] ^ AES3[BYTE(text.s2, 3)]; + text = t; + } + + Scratchpad[IDX(i + get_local_id(1))] = text; + } + } + mem_fence(CLK_GLOBAL_MEM_FENCE); } )===" R"===( -// cryptonight_monero_v8 && NVIDIA -#if(ALGO==11 && defined(__NV_CL_C_VERSION)) +// __NV_CL_C_VERSION checks if NVIDIA opencl is used +#if(ALGO == cryptonight_monero_v8 && defined(__NV_CL_C_VERSION)) # define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4)))) # define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4))))) #else @@ -547,16 +648,18 @@ R"===( __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states, uint Threads -// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 -#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) + +#if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) , __global ulong *input #endif ) { ulong a[2]; +#if(ALGO == cryptonight_conceal) + float4 conc_var = (float4)(0.0f); +#endif -// cryptonight_monero_v8 -#if(ALGO==11) +#if(ALGO == cryptonight_monero_v8) ulong b[4]; uint4 b_x[2]; // NVIDIA @@ -568,123 +671,134 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states ulong b[2]; uint4 b_x[1]; #endif - __local uint AES0[256], AES1[256]; + __local uint AES0[256], AES1[256]; -// cryptonight_monero_v8 -#if(ALGO==11) +#if(ALGO == cryptonight_monero_v8) # if defined(__clang__) && !defined(__NV_CL_C_VERSION) - __local uint RCP[256]; + __local uint RCP[256]; # endif 
uint2 division_result; uint sqrt_result; #endif - const uint gIdx = getIdx(); + const uint gIdx = getIdx(); for(int i = get_local_id(0); i < 256; i += WORKSIZE) { - const uint tmp = AES0_C[i]; - AES0[i] = tmp; - AES1[i] = rotate(tmp, 8U); -// cryptonight_monero_v8 -#if(ALGO==11 && (defined(__clang__) && !defined(__NV_CL_C_VERSION))) + const uint tmp = AES0_C[i]; + AES0[i] = tmp; + AES1[i] = rotate(tmp, 8U); + +#if(ALGO == cryptonight_monero_v8 && (defined(__clang__) && !defined(__NV_CL_C_VERSION))) RCP[i] = RCP_C[i]; #endif - } + } + + barrier(CLK_LOCAL_MEM_FENCE); - barrier(CLK_LOCAL_MEM_FENCE); -// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 -#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) - uint2 tweak1_2; +#if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) + uint2 tweak1_2; #endif #if(COMP_MODE==1) - // do not use early return here + // do not use early return here if(gIdx < Threads) #endif - { - states += 25 * gIdx; + { + states += 25 * gIdx; #if(STRIDED_INDEX==0) - Scratchpad += gIdx * (MEMORY >> 4); + Scratchpad += gIdx * (MEMORY >> 4); #elif(STRIDED_INDEX==1) Scratchpad += gIdx; #elif(STRIDED_INDEX==2) - Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0); + Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0); #elif(STRIDED_INDEX==3) Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + (gIdx % WORKSIZE); #endif - a[0] = states[0] ^ states[4]; - b[0] = states[2] ^ states[6]; - a[1] = states[1] ^ states[5]; - b[1] = states[3] ^ states[7]; + a[0] = states[0] ^ states[4]; + b[0] = states[2] ^ states[6]; + a[1] = states[1] ^ states[5]; + b[1] = states[3] ^ states[7]; b_x[0] = ((uint4 *)b)[0]; -// cryptonight_monero_v8 -#if(ALGO==11) - a[1] = 
states[1] ^ states[5]; - b[2] = states[8] ^ states[10]; - b[3] = states[9] ^ states[11]; +#if(ALGO == cryptonight_monero_v8) + a[1] = states[1] ^ states[5]; + b[2] = states[8] ^ states[10]; + b[3] = states[9] ^ states[11]; b_x[1] = ((uint4 *)b)[1]; division_result = as_uint2(states[12]); sqrt_result = as_uint2(states[13]).s0; #endif -// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 -#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) + +#if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) tweak1_2 = as_uint2(input[4]); tweak1_2.s0 >>= 24; tweak1_2.s0 |= tweak1_2.s1 << 8; tweak1_2.s1 = (uint)get_global_id(0); tweak1_2 ^= as_uint2(states[24]); #endif - } + } - mem_fence(CLK_LOCAL_MEM_FENCE); + mem_fence(CLK_LOCAL_MEM_FENCE); #if(COMP_MODE==1) - // do not use early return here + // do not use early return here if(gIdx < Threads) #endif - { + { uint idx0 = as_uint2(a[0]).s0 & MASK; #pragma unroll CN_UNROLL - for(int i = 0; i < ITERATIONS; ++i) - { + for(int i = 0; i < ITERATIONS; ++i) + { ulong c[2]; -// cryptonight_monero_v8 && NVIDIA -#if(ALGO==11 && defined(__NV_CL_C_VERSION)) + +#if(ALGO == cryptonight_monero_v8 && defined(__NV_CL_C_VERSION)) uint idxS = idx0 & 0x30U; *scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL; #endif ((uint4 *)c)[0] = SCRATCHPAD_CHUNK(0); -// cryptonight_bittube2 -#if(ALGO == 10) + +#if(ALGO == cryptonight_conceal) + float4 r = convert_float4_rte(((int4 *)c)[0]); + float4 c_old = conc_var; + r = _mm_add_ps(r, conc_var); + r = _mm_mul_ps(r, _mm_mul_ps(r, r)); + r = _mm_and_ps(r, 0x807FFFFF); + r = _mm_or_ps(r, 0x40000000); + conc_var = _mm_add_ps(conc_var, r); + + c_old = _mm_and_ps(c_old, 0x807FFFFF); + c_old = _mm_or_ps(c_old, 0x40000000); + float4 nc = _mm_mul_ps(c_old, (float4)(536870880.0f)); + ((int4 
*)c)[0] ^= convert_int4_rte(nc); +#endif + +#if(ALGO == cryptonight_bittube2) ((uint4 *)c)[0] = AES_Round2_bittube2(AES0, AES1, ~((uint4 *)c)[0], ((uint4 *)a)[0]); #else ((uint4 *)c)[0] = AES_Round2(AES0, AES1, ((uint4 *)c)[0], ((uint4 *)a)[0]); #endif -// cryptonight_monero_v8 -#if(ALGO==11) - { +#if(ALGO == cryptonight_monero_v8) + { ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)); ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3)); SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]); SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]); - SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]); - } + SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]); + } #endif -// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 -#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) +#if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) uint table = 0x75310U; b_x[0] ^= ((uint4 *)c)[0]; -// cryptonight_stellite -# if(ALGO == 7) + +# if(ALGO == cryptonight_stellite) uint index = ((b_x[0].s2 >> 27) & 12) | ((b_x[0].s2 >> 23) & 2); # else uint index = ((b_x[0].s2 >> 26) & 12) | ((b_x[0].s2 >> 23) & 2); @@ -692,8 +806,8 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states b_x[0].s2 ^= ((table >> index) & 0x30U) << 24; SCRATCHPAD_CHUNK(0) = b_x[0]; idx0 = as_uint2(c[0]).s0 & MASK; -// cryptonight_monero_v8 -#elif(ALGO==11) + +#elif(ALGO == cryptonight_monero_v8) SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0]; # ifdef __NV_CL_C_VERSION // flush shuffled data @@ -711,11 +825,11 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states #endif uint4 tmp; tmp = SCRATCHPAD_CHUNK(0); -// cryptonight_monero_v8 -#if(ALGO==11) + 
+#if(ALGO == cryptonight_monero_v8) // Use division and square root results from the _previous_ iteration to hide the latency - tmp.s0 ^= division_result.s0; - tmp.s1 ^= division_result.s1 ^ sqrt_result; + tmp.s0 ^= division_result.s0; + tmp.s1 ^= division_result.s1 ^ sqrt_result; // Most and least significant bits in the divisor are set to 1 // to make sure we don't divide by a small or even number, // so there are no shortcuts for such cases @@ -748,11 +862,10 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states a[1] += c[0] * as_ulong2(tmp).s0; a[0] += mul_hi(c[0], as_ulong2(tmp).s0); #endif -// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 -#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) -// cryptonight_ipbc || cryptonight_bittube2 -# if(ALGO == 6 || ALGO == 10) +#if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) + +# if(ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) uint2 ipbc_tmp = tweak1_2 ^ ((uint2 *)&(a[0]))[0]; ((uint2 *)&(a[1]))[0] ^= ipbc_tmp; SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0]; @@ -767,10 +880,9 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0]; #endif - ((uint4 *)a)[0] ^= tmp; + ((uint4 *)a)[0] ^= tmp; -// cryptonight_monero_v8 -#if (ALGO == 11) +#if (ALGO == cryptonight_monero_v8) # if defined(__NV_CL_C_VERSION) // flush shuffled data SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line; @@ -780,15 +892,13 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states b_x[0] = ((uint4 *)c)[0]; idx0 = as_uint2(a[0]).s0 & MASK; -// cryptonight_heavy || cryptonight_bittube2 -#if (ALGO == 4 || ALGO == 10) +#if (ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) long n = 
*((__global long*)(Scratchpad + (IDX((idx0) >> 4)))); int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2]; - long q = fast_div_heavy(n, d | 0x5); + long q = fast_div_heavy(n, d | 0x5); *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q; idx0 = (d ^ as_int2(q).s0) & MASK; -// cryptonight_haven || cryptonight_superfast -#elif (ALGO == 9 || ALGO == 12) +#elif (ALGO == cryptonight_haven || ALGO == cryptonight_superfast) long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))); int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2]; long q = fast_div_heavy(n, d | 0x5); @@ -796,461 +906,504 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states idx0 = ((~d) ^ as_int2(q).s0) & MASK; #endif - } - } - mem_fence(CLK_GLOBAL_MEM_FENCE); + } + } + mem_fence(CLK_GLOBAL_MEM_FENCE); } )===" R"===( __attribute__((reqd_work_group_size(8, 8, 1))) -__kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, uint Threads) +__kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states, + +#if (ALGO == cryptonight_gpu) + __global uint *output, ulong Target, uint Threads) +#else + __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, uint Threads) +#endif { - __local uint AES0[256], AES1[256], AES2[256], AES3[256]; - uint ExpandedKey2[40]; - uint4 text; - - const uint gIdx = getIdx(); - - for (int i = get_local_id(1) * 8 + get_local_id(0); i < 256; i += 8 * 8) { - const uint tmp = AES0_C[i]; - AES0[i] = tmp; - AES1[i] = rotate(tmp, 8U); - AES2[i] = rotate(tmp, 16U); - AES3[i] = rotate(tmp, 24U); - } - - barrier(CLK_LOCAL_MEM_FENCE); - -// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast -#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) - __local uint4 xin1[8][8]; - __local uint4 xin2[8][8]; + __local uint 
AES0[256], AES1[256], AES2[256], AES3[256]; + uint ExpandedKey2[40]; + uint4 text; + + const uint gIdx = getIdx(); + + for (int i = get_local_id(1) * 8 + get_local_id(0); i < 256; i += 8 * 8) { + const uint tmp = AES0_C[i]; + AES0[i] = tmp; + AES1[i] = rotate(tmp, 8U); + AES2[i] = rotate(tmp, 16U); + AES3[i] = rotate(tmp, 24U); + } + + barrier(CLK_LOCAL_MEM_FENCE); + +#if (ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + __local uint4 xin1[8][8]; + __local uint4 xin2[8][8]; #endif #if(COMP_MODE==1) - // do not use early return here - if(gIdx < Threads) + // do not use early return here + if(gIdx < Threads) #endif - { - states += 25 * gIdx; + { + states += 25 * gIdx; #if(STRIDED_INDEX==0) - Scratchpad += gIdx * (MEMORY >> 4); + Scratchpad += gIdx * (MEMORY >> 4); #elif(STRIDED_INDEX==1) Scratchpad += gIdx; #elif(STRIDED_INDEX==2) - Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * (gIdx % WORKSIZE); + Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * (gIdx % WORKSIZE); #elif(STRIDED_INDEX==3) Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + (gIdx % WORKSIZE); #endif - #if defined(__Tahiti__) || defined(__Pitcairn__) + #if defined(__Tahiti__) || defined(__Pitcairn__) - for(int i = 0; i < 4; ++i) ((ulong *)ExpandedKey2)[i] = states[i + 4]; - text = vload4(get_local_id(1) + 4, (__global uint *)states); + for(int i = 0; i < 4; ++i) ((ulong *)ExpandedKey2)[i] = states[i + 4]; + text = vload4(get_local_id(1) + 4, (__global uint *)states); - #else + #else - text = vload4(get_local_id(1) + 4, (__global uint *)states); - ((uint8 *)ExpandedKey2)[0] = vload8(1, (__global uint *)states); + text = vload4(get_local_id(1) + 4, (__global uint *)states); + ((uint8 *)ExpandedKey2)[0] = vload8(1, (__global uint *)states); - #endif + #endif - AESExpandKey256(ExpandedKey2); - } + AESExpandKey256(ExpandedKey2); + } - 
barrier(CLK_LOCAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); -// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast -#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) - __local uint4* xin1_store = &xin1[get_local_id(1)][get_local_id(0)]; - __local uint4* xin1_load = &xin1[(get_local_id(1) + 1) % 8][get_local_id(0)]; - __local uint4* xin2_store = &xin2[get_local_id(1)][get_local_id(0)]; - __local uint4* xin2_load = &xin2[(get_local_id(1) + 1) % 8][get_local_id(0)]; - *xin2_store = (uint4)(0, 0, 0, 0); +#if (ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + __local uint4* xin1_store = &xin1[get_local_id(1)][get_local_id(0)]; + __local uint4* xin1_load = &xin1[(get_local_id(1) + 1) % 8][get_local_id(0)]; + __local uint4* xin2_store = &xin2[get_local_id(1)][get_local_id(0)]; + __local uint4* xin2_load = &xin2[(get_local_id(1) + 1) % 8][get_local_id(0)]; + *xin2_store = (uint4)(0, 0, 0, 0); #endif #if(COMP_MODE == 1) - // do not use early return here - if (gIdx < Threads) + // do not use early return here + if (gIdx < Threads) #endif - { -#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) - #pragma unroll 2 - for(int i = 0, i1 = get_local_id(1); i < (MEMORY >> 7); ++i, i1 = (i1 + 16) % (MEMORY >> 4)) - { - text ^= Scratchpad[IDX((uint)i1)]; - barrier(CLK_LOCAL_MEM_FENCE); - text ^= *xin2_load; + { + +#if (ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + #pragma unroll 2 + for(int i = 0, i1 = get_local_id(1); i < (MEMORY >> 7); ++i, i1 = (i1 + 16) % (MEMORY >> 4)) + { + text ^= Scratchpad[IDX((uint)i1)]; + barrier(CLK_LOCAL_MEM_FENCE); + text ^= *xin2_load; - #pragma unroll 10 - for(int j = 0; j < 10; ++j) - text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); + #pragma unroll 10 + for(int j = 0; j < 
10; ++j) + text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); - *xin1_store = text; + *xin1_store = text; - text ^= Scratchpad[IDX((uint)i1 + 8u)]; - barrier(CLK_LOCAL_MEM_FENCE); - text ^= *xin1_load; + text ^= Scratchpad[IDX((uint)i1 + 8u)]; + barrier(CLK_LOCAL_MEM_FENCE); + text ^= *xin1_load; - #pragma unroll 10 - for(int j = 0; j < 10; ++j) - text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); + #pragma unroll 10 + for(int j = 0; j < 10; ++j) + text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); - *xin2_store = text; - } + *xin2_store = text; + } - barrier(CLK_LOCAL_MEM_FENCE); - text ^= *xin2_load; + barrier(CLK_LOCAL_MEM_FENCE); + text ^= *xin2_load; #else - #pragma unroll 2 - for (int i = 0; i < (MEMORY >> 7); ++i) { - text ^= Scratchpad[IDX((uint)((i << 3) + get_local_id(1)))]; - - #pragma unroll 10 - for(int j = 0; j < 10; ++j) - text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); - } + #pragma unroll 2 + for (int i = 0; i < (MEMORY >> 7); ++i) { + text ^= Scratchpad[IDX((uint)((i << 3) + get_local_id(1)))]; + + #pragma unroll 10 + for(int j = 0; j < 10; ++j) + text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); + } #endif - } - -// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast -#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) - /* Also left over threads performe this loop. 
- * The left over thread results will be ignored - */ - #pragma unroll 16 - for(size_t i = 0; i < 16; i++) - { - #pragma unroll 10 - for (int j = 0; j < 10; ++j) { - text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); - } - - barrier(CLK_LOCAL_MEM_FENCE); - *xin1_store = text; - barrier(CLK_LOCAL_MEM_FENCE); - text ^= *xin1_load; - } + } + +#if (ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + /* Also left over threads performe this loop. + * The left over thread results will be ignored + */ + #pragma unroll 16 + for(size_t i = 0; i < 16; i++) + { + #pragma unroll 10 + for (int j = 0; j < 10; ++j) { + text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); + } + + barrier(CLK_LOCAL_MEM_FENCE); + *xin1_store = text; + barrier(CLK_LOCAL_MEM_FENCE); + text ^= *xin1_load; + } #endif - __local ulong State_buf[8 * 25]; + __local ulong State_buf[8 * 25]; #if(COMP_MODE==1) - // do not use early return here - if(gIdx < Threads) + // do not use early return here + if(gIdx < Threads) #endif - { - vstore2(as_ulong2(text), get_local_id(1) + 4, states); - } + { + vstore2(as_ulong2(text), get_local_id(1) + 4, states); + } - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_GLOBAL_MEM_FENCE); #if(COMP_MODE==1) - // do not use early return here - if(gIdx < Threads) + // do not use early return here + if(gIdx < Threads) #endif - { - if(!get_local_id(1)) - { - __local ulong* State = State_buf + get_local_id(0) * 25; + { + if(!get_local_id(1)) + { + __local ulong* State = State_buf + get_local_id(0) * 25; - for(int i = 0; i < 25; ++i) State[i] = states[i]; + for(int i = 0; i < 25; ++i) State[i] = states[i]; - keccakf1600_2(State); + keccakf1600_2(State); - for(int i = 0; i < 25; ++i) states[i] = State[i]; +#if (ALGO == cryptonight_gpu) + if(State[3] <= Target) + { + ulong outIdx = atomic_inc(output + 0xFF); + if(outIdx < 0xFF) + output[outIdx] = 
get_global_id(0); + } +#else + for(int i = 0; i < 25; ++i) states[i] = State[i]; - uint StateSwitch = State[0] & 3; - __global uint *destinationBranch1 = StateSwitch == 0 ? Branch0 : Branch1; - __global uint *destinationBranch2 = StateSwitch == 2 ? Branch2 : Branch3; - __global uint *destinationBranch = StateSwitch < 2 ? destinationBranch1 : destinationBranch2; - destinationBranch[atomic_inc(destinationBranch + Threads)] = gIdx; - } - } - mem_fence(CLK_GLOBAL_MEM_FENCE); + uint StateSwitch = State[0] & 3; + __global uint *destinationBranch1 = StateSwitch == 0 ? Branch0 : Branch1; + __global uint *destinationBranch2 = StateSwitch == 2 ? Branch2 : Branch3; + __global uint *destinationBranch = StateSwitch < 2 ? destinationBranch1 : destinationBranch2; + destinationBranch[atomic_inc(destinationBranch + Threads)] = gIdx; +#endif + } + } + mem_fence(CLK_GLOBAL_MEM_FENCE); } )===" R"===( #define VSWAP8(x) (((x) >> 56) | (((x) >> 40) & 0x000000000000FF00UL) | (((x) >> 24) & 0x0000000000FF0000UL) \ - | (((x) >> 8) & 0x00000000FF000000UL) | (((x) << 8) & 0x000000FF00000000UL) \ - | (((x) << 24) & 0x0000FF0000000000UL) | (((x) << 40) & 0x00FF000000000000UL) | (((x) << 56) & 0xFF00000000000000UL)) + | (((x) >> 8) & 0x00000000FF000000UL) | (((x) << 8) & 0x000000FF00000000UL) \ + | (((x) << 24) & 0x0000FF0000000000UL) | (((x) << 40) & 0x00FF000000000000UL) | (((x) << 56) & 0xFF00000000000000UL)) #define VSWAP4(x) ((((x) >> 24) & 0xFFU) | (((x) >> 8) & 0xFF00U) | (((x) << 8) & 0xFF0000U) | (((x) << 24) & 0xFF000000U)) __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, uint Threads) { - const ulong idx = get_global_id(0) - get_global_offset(0); - - // do not use early return here - if(idx < BranchBuf[Threads]) - { - states += 25 * BranchBuf[idx]; + const uint idx = get_global_id(0) - get_global_offset(0); - // skein - ulong8 h = vload8(0, SKEIN512_256_IV); + // do not use early return here + if(idx < BranchBuf[Threads]) + { + 
states += 25 * BranchBuf[idx]; - // Type field begins with final bit, first bit, then six bits of type; the last 96 - // bits are input processed (including in the block to be processed with that tweak) - // The output transform is only one run of UBI, since we need only 256 bits of output - // The tweak for the output transform is Type = Output with the Final bit set - // T[0] for the output is 8, and I don't know why - should be message size... - ulong t[3] = { 0x00UL, 0x7000000000000000UL, 0x00UL }; - ulong8 p, m; + // skein + ulong8 h = vload8(0, SKEIN512_256_IV); - for(uint i = 0; i < 4; ++i) - { - t[0] += i < 3 ? 0x40UL : 0x08UL; + // Type field begins with final bit, first bit, then six bits of type; the last 96 + // bits are input processed (including in the block to be processed with that tweak) + // The output transform is only one run of UBI, since we need only 256 bits of output + // The tweak for the output transform is Type = Output with the Final bit set + // T[0] for the output is 8, and I don't know why - should be message size... + ulong t[3] = { 0x00UL, 0x7000000000000000UL, 0x00UL }; + ulong8 p, m; - t[2] = t[0] ^ t[1]; + #pragma unroll 1 + for (uint i = 0; i < 4; ++i) + { + t[0] += i < 3 ? 0x40UL : 0x08UL; - m = (i < 3) ? vload8(i, states) : (ulong8)(states[24], 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL); - const ulong h8 = h.s0 ^ h.s1 ^ h.s2 ^ h.s3 ^ h.s4 ^ h.s5 ^ h.s6 ^ h.s7 ^ SKEIN_KS_PARITY; - p = Skein512Block(m, h, h8, t); + t[2] = t[0] ^ t[1]; - h = m ^ p; + m = (i < 3) ? vload8(i, states) : (ulong8)(states[24], 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL); + const ulong h8 = h.s0 ^ h.s1 ^ h.s2 ^ h.s3 ^ h.s4 ^ h.s5 ^ h.s6 ^ h.s7 ^ SKEIN_KS_PARITY; + p = Skein512Block(m, h, h8, t); - t[1] = i < 2 ? 0x3000000000000000UL : 0xB000000000000000UL; - } + h = m ^ p; - t[0] = 0x08UL; - t[1] = 0xFF00000000000000UL; - t[2] = t[0] ^ t[1]; + t[1] = i < 2 ? 
0x3000000000000000UL : 0xB000000000000000UL; + } - p = (ulong8)(0); - const ulong h8 = h.s0 ^ h.s1 ^ h.s2 ^ h.s3 ^ h.s4 ^ h.s5 ^ h.s6 ^ h.s7 ^ SKEIN_KS_PARITY; + t[0] = 0x08UL; + t[1] = 0xFF00000000000000UL; + t[2] = t[0] ^ t[1]; - p = Skein512Block(p, h, h8, t); + p = (ulong8)(0); + const ulong h8 = h.s0 ^ h.s1 ^ h.s2 ^ h.s3 ^ h.s4 ^ h.s5 ^ h.s6 ^ h.s7 ^ SKEIN_KS_PARITY; - //vstore8(p, 0, output); + p = Skein512Block(p, h, h8, t); - // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values - // and expect an accurate result for target > 32-bit without implementing carries - if(p.s3 <= Target) + // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values + // and expect an accurate result for target > 32-bit without implementing carries + if (p.s3 <= Target) { - ulong outIdx = atomic_inc(output + 0xFF); + ulong outIdx = atomic_inc(output + 0xFF); if(outIdx < 0xFF) - output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0); - } - } - mem_fence(CLK_GLOBAL_MEM_FENCE); + output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0); + } + } + mem_fence(CLK_GLOBAL_MEM_FENCE); } #define SWAP8(x) as_ulong(as_uchar8(x).s76543210) #define JHXOR \ - h0h ^= input[0]; \ - h0l ^= input[1]; \ - h1h ^= input[2]; \ - h1l ^= input[3]; \ - h2h ^= input[4]; \ - h2l ^= input[5]; \ - h3h ^= input[6]; \ - h3l ^= input[7]; \ + h0h ^= input[0]; \ + h0l ^= input[1]; \ + h1h ^= input[2]; \ + h1l ^= input[3]; \ + h2h ^= input[4]; \ + h2l ^= input[5]; \ + h3h ^= input[6]; \ + h3l ^= input[7]; \ \ - E8; \ + E8; \ \ - h4h ^= input[0]; \ - h4l ^= input[1]; \ - h5h ^= input[2]; \ - h5l ^= input[3]; \ - h6h ^= input[4]; \ - h6l ^= input[5]; \ - h7h ^= input[6]; \ - h7l ^= input[7] + h4h ^= input[0]; \ + h4l ^= input[1]; \ + h5h ^= input[2]; \ + h5l ^= input[3]; \ + h6h ^= input[4]; \ + h6l ^= input[5]; \ + h7h ^= input[6]; \ + h7l ^= input[7] __kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint 
*output, ulong Target, uint Threads) { - const uint idx = get_global_id(0) - get_global_offset(0); - - // do not use early return here - if(idx < BranchBuf[Threads]) - { - states += 25 * BranchBuf[idx]; - - sph_u64 h0h = 0xEBD3202C41A398EBUL, h0l = 0xC145B29C7BBECD92UL, h1h = 0xFAC7D4609151931CUL, h1l = 0x038A507ED6820026UL, h2h = 0x45B92677269E23A4UL, h2l = 0x77941AD4481AFBE0UL, h3h = 0x7A176B0226ABB5CDUL, h3l = 0xA82FFF0F4224F056UL; - sph_u64 h4h = 0x754D2E7F8996A371UL, h4l = 0x62E27DF70849141DUL, h5h = 0x948F2476F7957627UL, h5l = 0x6C29804757B6D587UL, h6h = 0x6C0D8EAC2D275E5CUL, h6l = 0x0F7A0557C6508451UL, h7h = 0xEA12247067D3E47BUL, h7l = 0x69D71CD313ABE389UL; - sph_u64 tmp; - - for(int i = 0; i < 3; ++i) - { - ulong input[8]; - - const int shifted = i << 3; - for(int x = 0; x < 8; ++x) input[x] = (states[shifted + x]); - JHXOR; - } - { - ulong input[8]; - input[0] = (states[24]); - input[1] = 0x80UL; - #pragma unroll 6 - for(int x = 2; x < 8; ++x) input[x] = 0x00UL; - JHXOR; - } - { - ulong input[8]; - for(int x = 0; x < 7; ++x) input[x] = 0x00UL; - input[7] = 0x4006000000000000UL; - JHXOR; - } - - //output[0] = h6h; - //output[1] = h6l; - //output[2] = h7h; - //output[3] = h7l; - - // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values - // and expect an accurate result for target > 32-bit without implementing carries + const uint idx = get_global_id(0) - get_global_offset(0); + + // do not use early return here + if(idx < BranchBuf[Threads]) + { + states += 25 * BranchBuf[idx]; + + sph_u64 h0h = 0xEBD3202C41A398EBUL, h0l = 0xC145B29C7BBECD92UL, h1h = 0xFAC7D4609151931CUL, h1l = 0x038A507ED6820026UL, h2h = 0x45B92677269E23A4UL, h2l = 0x77941AD4481AFBE0UL, h3h = 0x7A176B0226ABB5CDUL, h3l = 0xA82FFF0F4224F056UL; + sph_u64 h4h = 0x754D2E7F8996A371UL, h4l = 0x62E27DF70849141DUL, h5h = 0x948F2476F7957627UL, h5l = 0x6C29804757B6D587UL, h6h = 0x6C0D8EAC2D275E5CUL, h6l = 0x0F7A0557C6508451UL, h7h = 0xEA12247067D3E47BUL, h7l = 
0x69D71CD313ABE389UL; + sph_u64 tmp; + + for(uint i = 0; i < 3; ++i) + { + ulong input[8]; + + const int shifted = i << 3; + for (uint x = 0; x < 8; ++x) + { + input[x] = (states[shifted + x]); + } + + JHXOR; + } + + { + ulong input[8] = { (states[24]), 0x80UL, 0x00UL, 0x00UL, 0x00UL, 0x00UL, 0x00UL, 0x00UL }; + JHXOR; + } + + { + ulong input[8] = { 0x00UL, 0x00UL, 0x00UL, 0x00UL, 0x00UL, 0x00UL, 0x00UL, 0x4006000000000000UL }; + JHXOR; + } + + // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values + // and expect an accurate result for target > 32-bit without implementing carries if(h7l <= Target) { - ulong outIdx = atomic_inc(output + 0xFF); + ulong outIdx = atomic_inc(output + 0xFF); if(outIdx < 0xFF) output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0); - } - } - } + } + } +} #define SWAP4(x) as_uint(as_uchar4(x).s3210) __kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, uint Threads) { - const uint idx = get_global_id(0) - get_global_offset(0); - - // do not use early return here - if(idx < BranchBuf[Threads]) - { - states += 25 * BranchBuf[idx]; - - unsigned int m[16]; - unsigned int v[16]; - uint h[8]; - - ((uint8 *)h)[0] = vload8(0U, c_IV256); - - #pragma unroll 4 - for(uint i = 0, bitlen = 0; i < 4; ++i) - { - if(i < 3) - { - ((uint16 *)m)[0] = vload16(i, (__global uint *)states); - for(int i = 0; i < 16; ++i) m[i] = SWAP4(m[i]); - bitlen += 512; - } - else - { - m[0] = SWAP4(((__global uint *)states)[48]); - m[1] = SWAP4(((__global uint *)states)[49]); - m[2] = 0x80000000U; - - for(int i = 3; i < 13; ++i) m[i] = 0x00U; - - m[13] = 1U; - m[14] = 0U; - m[15] = 0x640; - bitlen += 64; - } - - ((uint16 *)v)[0].lo = ((uint8 *)h)[0]; - ((uint16 *)v)[0].hi = vload8(0U, c_u256); - - //v[12] ^= (i < 3) ? (i + 1) << 9 : 1600U; - //v[13] ^= (i < 3) ? 
(i + 1) << 9 : 1600U; - - v[12] ^= bitlen; - v[13] ^= bitlen; - - for(int r = 0; r < 14; r++) - { - GS(0, 4, 0x8, 0xC, 0x0); - GS(1, 5, 0x9, 0xD, 0x2); - GS(2, 6, 0xA, 0xE, 0x4); - GS(3, 7, 0xB, 0xF, 0x6); - GS(0, 5, 0xA, 0xF, 0x8); - GS(1, 6, 0xB, 0xC, 0xA); - GS(2, 7, 0x8, 0xD, 0xC); - GS(3, 4, 0x9, 0xE, 0xE); - } - - ((uint8 *)h)[0] ^= ((uint8 *)v)[0] ^ ((uint8 *)v)[1]; - } - - for(int i = 0; i < 8; ++i) h[i] = SWAP4(h[i]); - - // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values - // and expect an accurate result for target > 32-bit without implementing carries - uint2 t = (uint2)(h[6],h[7]); - if( as_ulong(t) <= Target) + const uint idx = get_global_id(0) - get_global_offset(0); + + // do not use early return here + if(idx < BranchBuf[Threads]) + { + states += 25 * BranchBuf[idx]; + + unsigned int m[16]; + unsigned int v[16]; + uint h[8]; + uint bitlen = 0; + + ((uint8 *)h)[0] = vload8(0U, c_IV256); + + for (uint i = 0; i < 3; ++i) + { + ((uint16 *)m)[0] = vload16(i, (__global uint *)states); + for (uint x = 0; x < 16; ++x) + { + m[x] = SWAP4(m[x]); + } + + bitlen += 512; + + ((uint16 *)v)[0].lo = ((uint8 *)h)[0]; + ((uint16 *)v)[0].hi = vload8(0U, c_u256); + + v[12] ^= bitlen; + v[13] ^= bitlen; + + for (uint r = 0; r < 14; r++) { + GS(0, 4, 0x8, 0xC, 0x0); + GS(1, 5, 0x9, 0xD, 0x2); + GS(2, 6, 0xA, 0xE, 0x4); + GS(3, 7, 0xB, 0xF, 0x6); + GS(0, 5, 0xA, 0xF, 0x8); + GS(1, 6, 0xB, 0xC, 0xA); + GS(2, 7, 0x8, 0xD, 0xC); + GS(3, 4, 0x9, 0xE, 0xE); + } + + ((uint8 *)h)[0] ^= ((uint8 *)v)[0] ^ ((uint8 *)v)[1]; + } + + m[0] = SWAP4(((__global uint *)states)[48]); + m[1] = SWAP4(((__global uint *)states)[49]); + m[2] = 0x80000000U; + m[3] = 0x00U; + m[4] = 0x00U; + m[5] = 0x00U; + m[6] = 0x00U; + m[7] = 0x00U; + m[8] = 0x00U; + m[9] = 0x00U; + m[10] = 0x00U; + m[11] = 0x00U; + m[12] = 0x00U; + m[13] = 1U; + m[14] = 0U; + m[15] = 0x640; + + bitlen += 64; + + ((uint16 *)v)[0].lo = ((uint8 *)h)[0]; + ((uint16 *)v)[0].hi = 
vload8(0U, c_u256); + + v[12] ^= bitlen; + v[13] ^= bitlen; + + for (uint r = 0; r < 14; r++) { + GS(0, 4, 0x8, 0xC, 0x0); + GS(1, 5, 0x9, 0xD, 0x2); + GS(2, 6, 0xA, 0xE, 0x4); + GS(3, 7, 0xB, 0xF, 0x6); + GS(0, 5, 0xA, 0xF, 0x8); + GS(1, 6, 0xB, 0xC, 0xA); + GS(2, 7, 0x8, 0xD, 0xC); + GS(3, 4, 0x9, 0xE, 0xE); + } + + ((uint8 *)h)[0] ^= ((uint8 *)v)[0] ^ ((uint8 *)v)[1]; + + for (uint i = 0; i < 8; ++i) { + h[i] = SWAP4(h[i]); + } + + // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values + // and expect an accurate result for target > 32-bit without implementing carries + uint2 t = (uint2)(h[6],h[7]); + if(as_ulong(t) <= Target) { - ulong outIdx = atomic_inc(output + 0xFF); + ulong outIdx = atomic_inc(output + 0xFF); if(outIdx < 0xFF) output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0); - } - } - } - -__kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, uint Threads) -{ - const uint idx = get_global_id(0) - get_global_offset(0); + } + } +} - // do not use early return here - if(idx < BranchBuf[Threads]) - { - states += 25 * BranchBuf[idx]; +#undef SWAP4 - ulong State[8]; - for(int i = 0; i < 7; ++i) State[i] = 0UL; +__kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, uint Threads) +{ + const uint idx = get_global_id(0) - get_global_offset(0); - State[7] = 0x0001000000000000UL; + // do not use early return here + if(idx < BranchBuf[Threads]) + { + states += 25 * BranchBuf[idx]; - #pragma unroll 4 - for(uint i = 0; i < 4; ++i) - { - volatile ulong H[8], M[8]; + ulong State[8] = { 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0x0001000000000000UL }; +#if defined(__clang__) && !defined(__NV_CL_C_VERSION) + // on ROCM we need volatile for AMD RX5xx cards to avoid invalid shares + volatile +#endif + ulong H[8], M[8]; - if(i < 3) - { - ((ulong8 *)M)[0] = vload8(i, states); - } - else - { - M[0] = states[24]; - M[1] = 0x80UL; 
+ for (uint i = 0; i < 3; ++i) { + ((ulong8 *)M)[0] = vload8(i, states); - for(int x = 2; x < 7; ++x) M[x] = 0UL; + for (uint x = 0; x < 8; ++x) { + H[x] = M[x] ^ State[x]; + } - M[7] = 0x0400000000000000UL; - } + PERM_SMALL_P(H); + PERM_SMALL_Q(M); - for(int x = 0; x < 8; ++x) H[x] = M[x] ^ State[x]; + for (uint x = 0; x < 8; ++x) + { + State[x] ^= H[x] ^ M[x]; + } + } - PERM_SMALL_P(H); - PERM_SMALL_Q(M); + M[0] = states[24]; + M[1] = 0x80UL; + M[2] = 0UL; + M[3] = 0UL; + M[4] = 0UL; + M[5] = 0UL; + M[6] = 0UL; + M[7] = 0x0400000000000000UL; - for(int x = 0; x < 8; ++x) State[x] ^= H[x] ^ M[x]; - } + for (uint x = 0; x < 8; ++x) { + H[x] = M[x] ^ State[x]; + } - ulong tmp[8]; + PERM_SMALL_P(H); + PERM_SMALL_Q(M); - for(int i = 0; i < 8; ++i) tmp[i] = State[i]; + ulong tmp[8]; + for (uint i = 0; i < 8; ++i) { + tmp[i] = State[i] ^= H[i] ^ M[i]; + } - PERM_SMALL_P(State); + PERM_SMALL_P(State); - for(int i = 0; i < 8; ++i) State[i] ^= tmp[i]; + for (uint i = 0; i < 8; ++i) { + State[i] ^= tmp[i]; + } - // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values - // and expect an accurate result for target > 32-bit without implementing carries + // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values + // and expect an accurate result for target > 32-bit without implementing carries if(State[7] <= Target) { - ulong outIdx = atomic_inc(output + 0xFF); + ulong outIdx = atomic_inc(output + 0xFF); if(outIdx < 0xFF) output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0); - } - } - } + } + } +} )===" diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl new file mode 100644 index 000000000..e87819760 --- /dev/null +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl @@ -0,0 +1,329 @@ +R"===( + +inline global int4* scratchpad_ptr(uint idx, uint n, __global int *lpad) { return (__global int4*)((__global char*)lpad + (idx & 
MASK) + n * 16); } + +inline float4 fma_break(float4 x) +{ + // Break the dependency chain by setitng the exp to ?????01 + x = _mm_and_ps(x, 0xFEFFFFFF); + return _mm_or_ps(x, 0x00800000); +} + +inline void sub_round(float4 n0, float4 n1, float4 n2, float4 n3, float4 rnd_c, float4* n, float4* d, float4* c) +{ + n1 = _mm_add_ps(n1, *c); + float4 nn = _mm_mul_ps(n0, *c); + nn = _mm_mul_ps(n1, _mm_mul_ps(nn,nn)); + nn = fma_break(nn); + *n = _mm_add_ps(*n, nn); + + n3 = _mm_sub_ps(n3, *c); + float4 dd = _mm_mul_ps(n2, *c); + dd = _mm_mul_ps(n3, _mm_mul_ps(dd,dd)); + dd = fma_break(dd); + *d = _mm_add_ps(*d, dd); + + //Constant feedback + *c = _mm_add_ps(*c, rnd_c); + *c = _mm_add_ps(*c, (float4)(0.734375f)); + float4 r = _mm_add_ps(nn, dd); + r = _mm_and_ps(r, 0x807FFFFF); + r = _mm_or_ps(r, 0x40000000); + *c = _mm_add_ps(*c, r); + +} + +// 9*8 + 2 = 74 +inline void round_compute(float4 n0, float4 n1, float4 n2, float4 n3, float4 rnd_c, float4* c, float4* r) +{ + float4 n = (float4)(0.0f); + float4 d = (float4)(0.0f); + + sub_round(n0, n1, n2, n3, rnd_c, &n, &d, c); + sub_round(n1, n2, n3, n0, rnd_c, &n, &d, c); + sub_round(n2, n3, n0, n1, rnd_c, &n, &d, c); + sub_round(n3, n0, n1, n2, rnd_c, &n, &d, c); + sub_round(n3, n2, n1, n0, rnd_c, &n, &d, c); + sub_round(n2, n1, n0, n3, rnd_c, &n, &d, c); + sub_round(n1, n0, n3, n2, rnd_c, &n, &d, c); + sub_round(n0, n3, n2, n1, rnd_c, &n, &d, c); + + // Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0 + d = _mm_and_ps(d, 0xFF7FFFFF); + d = _mm_or_ps(d, 0x40000000); + *r =_mm_add_ps(*r, _mm_div_ps(n,d)); +} + +inline int4 single_comupte(float4 n0, float4 n1, float4 n2, float4 n3, float cnt, float4 rnd_c, __local float4* sum) +{ + float4 c= (float4)(cnt); + // 35 maths calls follow (140 FLOPS) + float4 r = (float4)(0.0f); + + for(int i = 0; i < 4; ++i) + round_compute(n0, n1, n2, n3, rnd_c, &c, &r); + + // do a quick fmod by setting exp to 2 + r = _mm_and_ps(r, 0x807FFFFF); + 
r = _mm_or_ps(r, 0x40000000); + *sum = r; // 34 + float4 x = (float4)(536870880.0f); + r = _mm_mul_ps(r, x); // 35 + return convert_int4_rte(r); +} + +inline void single_comupte_wrap(const uint rot, int4 v0, int4 v1, int4 v2, int4 v3, float cnt, float4 rnd_c, __local float4* sum, __local int4* out) +{ + float4 n0 = convert_float4_rte(v0); + float4 n1 = convert_float4_rte(v1); + float4 n2 = convert_float4_rte(v2); + float4 n3 = convert_float4_rte(v3); + + int4 r = single_comupte(n0, n1, n2, n3, cnt, rnd_c, sum); + *out = rot == 0 ? r : _mm_alignr_epi8(r, rot); +} + +)===" +R"===( + +static const __constant uint look[16][4] = { + {0, 1, 2, 3}, + {0, 2, 3, 1}, + {0, 3, 1, 2}, + {0, 3, 2, 1}, + + {1, 0, 2, 3}, + {1, 2, 3, 0}, + {1, 3, 0, 2}, + {1, 3, 2, 0}, + + {2, 1, 0, 3}, + {2, 0, 3, 1}, + {2, 3, 1, 0}, + {2, 3, 0, 1}, + + {3, 1, 2, 0}, + {3, 2, 0, 1}, + {3, 0, 1, 2}, + {3, 0, 2, 1} +}; + +static const __constant float ccnt[16] = { + 1.34375f, + 1.28125f, + 1.359375f, + 1.3671875f, + + 1.4296875f, + 1.3984375f, + 1.3828125f, + 1.3046875f, + + 1.4140625f, + 1.2734375f, + 1.2578125f, + 1.2890625f, + + 1.3203125f, + 1.3515625f, + 1.3359375f, + 1.4609375f +}; + +struct SharedMemChunk +{ + int4 out[16]; + float4 va[16]; +}; + +__attribute__((reqd_work_group_size(WORKSIZE * 16, 1, 1))) +__kernel void JOIN(cn1_cn_gpu,ALGO)(__global int *lpad_in, __global int *spad, uint numThreads) +{ + const uint gIdx = getIdx(); + +#if(COMP_MODE==1) + if(gIdx/16 >= numThreads) + return; +#endif + + uint chunk = get_local_id(0) / 16; + +#if(STRIDED_INDEX==0) + __global int* lpad = (__global int*)((__global char*)lpad_in + MEMORY * (gIdx/16)); +#endif + + __local struct SharedMemChunk smem_in[WORKSIZE]; + __local struct SharedMemChunk* smem = smem_in + chunk; + + uint tid = get_local_id(0) % 16; + + uint idxHash = gIdx/16; + uint s = ((__global uint*)spad)[idxHash * 50] >> 8; + float4 vs = (float4)(0); + + // tid divided + const uint tidd = tid / 4; + // tid modulo + const uint tidm = tid 
% 4; + const uint block = tidd * 16 + tidm; + + #pragma unroll CN_UNROLL + for(size_t i = 0; i < ITERATIONS; i++) + { + mem_fence(CLK_LOCAL_MEM_FENCE); + int tmp = ((__global int*)scratchpad_ptr(s, tidd, lpad))[tidm]; + ((__local int*)(smem->out))[tid] = tmp; + mem_fence(CLK_LOCAL_MEM_FENCE); + + { + single_comupte_wrap( + tidm, + *(smem->out + look[tid][0]), + *(smem->out + look[tid][1]), + *(smem->out + look[tid][2]), + *(smem->out + look[tid][3]), + ccnt[tid], vs, smem->va + tid, + smem->out + tid + ); + } + mem_fence(CLK_LOCAL_MEM_FENCE); + + int outXor = ((__local int*)smem->out)[block]; + for(uint dd = block + 4; dd < (tidd + 1) * 16; dd += 4) + outXor ^= ((__local int*)smem->out)[dd]; + + ((__global int*)scratchpad_ptr(s, tidd, lpad))[tidm] = outXor ^ tmp; + ((__local int*)smem->out)[tid] = outXor; + + float va_tmp1 = ((__local float*)smem->va)[block] + ((__local float*)smem->va)[block + 4]; + float va_tmp2 = ((__local float*)smem->va)[block+ 8] + ((__local float*)smem->va)[block + 12]; + ((__local float*)smem->va)[tid] = va_tmp1 + va_tmp2; + + mem_fence(CLK_LOCAL_MEM_FENCE); + + int out2 = ((__local int*)smem->out)[tid] ^ ((__local int*)smem->out)[tid + 4 ] ^ ((__local int*)smem->out)[tid + 8] ^ ((__local int*)smem->out)[tid + 12]; + va_tmp1 = ((__local float*)smem->va)[block] + ((__local float*)smem->va)[block + 4]; + va_tmp2 = ((__local float*)smem->va)[block + 8] + ((__local float*)smem->va)[block + 12]; + va_tmp1 = va_tmp1 + va_tmp2; + va_tmp1 = fabs(va_tmp1); + + float xx = va_tmp1 * 16777216.0f; + int xx_int = (int)xx; + ((__local int*)smem->out)[tid] = out2 ^ xx_int; + ((__local float*)smem->va)[tid] = va_tmp1 / 64.0f; + + mem_fence(CLK_LOCAL_MEM_FENCE); + + vs = smem->va[0]; + s = smem->out[0].x ^ smem->out[0].y ^ smem->out[0].z ^ smem->out[0].w; + } +} + +)===" +R"===( + +static const __constant uint skip[3] = { + 20,22,22 +}; + +inline void generate_512(uint idx, __local ulong* in, __global ulong* out) +{ + ulong hash[25]; + + hash[0] = in[0] ^ 
idx; + for(int i = 1; i < 25; ++i) + hash[i] = in[i]; + + for(int a = 0; a < 3;++a) + { + keccakf1600_1(hash); + for(int i = 0; i < skip[a]; ++i) + out[i] = hash[i]; + out+=skip[a]; + } +} + +__attribute__((reqd_work_group_size(8, 8, 1))) +__kernel void JOIN(cn0_cn_gpu,ALGO)(__global ulong *input, __global int *Scratchpad, __global ulong *states, uint Threads) +{ + const uint gIdx = getIdx(); + __local ulong State_buf[8 * 25]; + __local ulong* State = State_buf + get_local_id(0) * 25; + +#if(COMP_MODE==1) + // do not use early return here + if(gIdx < Threads) +#endif + { + states += 25 * gIdx; + +#if(STRIDED_INDEX==0) + Scratchpad = (__global int*)((__global char*)Scratchpad + MEMORY * gIdx); +#endif + + if (get_local_id(1) == 0) + { + +// NVIDIA +#ifdef __NV_CL_C_VERSION + for(uint i = 0; i < 8; ++i) + State[i] = input[i]; +#else + ((__local ulong8 *)State)[0] = vload8(0, input); +#endif + State[8] = input[8]; + State[9] = input[9]; + State[10] = input[10]; + + ((__local uint *)State)[9] &= 0x00FFFFFFU; + ((__local uint *)State)[9] |= (((uint)get_global_id(0)) & 0xFF) << 24; + ((__local uint *)State)[10] &= 0xFF000000U; + /* explicit cast to `uint` is required because some OpenCL implementations (e.g. NVIDIA) + * handle get_global_id and get_global_offset as signed long long int and add + * 0xFFFFFFFF... 
to `get_global_id` if we set on host side a 32bit offset where the first bit is `1` + * (even if it is correct casted to unsigned on the host) + */ + ((__local uint *)State)[10] |= (((uint)get_global_id(0) >> 8)); + + for (int i = 11; i < 25; ++i) { + State[i] = 0x00UL; + } + + // Last bit of padding + State[16] = 0x8000000000000000UL; + + keccakf1600_2(State); + + #pragma unroll + for (int i = 0; i < 25; ++i) { + states[i] = State[i]; + } + } + } +} + +__attribute__((reqd_work_group_size(64, 1, 1))) +__kernel void JOIN(cn00_cn_gpu,ALGO)(__global int *Scratchpad, __global ulong *states) +{ + const uint gIdx = getIdx() / 64; + __local ulong State[25]; + + states += 25 * gIdx; + +#if(STRIDED_INDEX==0) + Scratchpad = (__global int*)((__global char*)Scratchpad + MEMORY * gIdx); +#endif + + for(int i = get_local_id(0); i < 25; i+=get_local_size(0)) + State[i] = states[i]; + + barrier(CLK_LOCAL_MEM_FENCE); + + + for(uint i = get_local_id(0); i < MEMORY / 512; i += get_local_size(0)) + { + generate_512(i, State, (__global ulong*)((__global uchar*)Scratchpad + i*512)); + } +} + +)===" diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.cl new file mode 100644 index 000000000..9edb774ad --- /dev/null +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.cl @@ -0,0 +1,220 @@ +R"===( +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ * + */ + +#define cryptonight_r_wow 15 +#define cryptonight_r 16 + +#define MEM_CHUNK (1 << MEM_CHUNK_EXPONENT) + +#if(STRIDED_INDEX==0) +# define IDX(x) (x) +#elif(STRIDED_INDEX==1) +# define IDX(x) (mul24(((uint)(x)), Threads)) +#elif(STRIDED_INDEX==2) +# define IDX(x) (((x) % MEM_CHUNK) + ((x) / MEM_CHUNK) * WORKSIZE * MEM_CHUNK) +#elif(STRIDED_INDEX==3) +# define IDX(x) ((x) * WORKSIZE) +#endif + +// __NV_CL_C_VERSION checks if NVIDIA opencl is used +#if(ALGO == cryptonight_monero_v8 && defined(__NV_CL_C_VERSION)) +# define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idx1 ^ (N << 4)))) +# define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4))))) +#else +# define SCRATCHPAD_CHUNK(N) (Scratchpad[IDX(((idx) >> 4) ^ N)]) +#endif + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void cn1_cryptonight_r(__global uint4 *Scratchpad, __global ulong *states, uint Threads) +{ + ulong a[2], b[4]; + __local uint AES0[256], AES1[256], AES2[256], AES3[256]; + +#ifdef __NV_CL_C_VERSION + __local uint16 scratchpad_line_buf[WORKSIZE]; + __local uint16* scratchpad_line = scratchpad_line_buf + get_local_id(0); +#endif + + const ulong gIdx = get_global_id(0) - get_global_offset(0); + + for(int i = get_local_id(0); i < 256; i += WORKSIZE) + { + const uint tmp = AES0_C[i]; + AES0[i] = tmp; + AES1[i] = rotate(tmp, 8U); + AES2[i] = rotate(tmp, 16U); + AES3[i] = rotate(tmp, 24U); + } + + barrier(CLK_LOCAL_MEM_FENCE); + +# if (COMP_MODE == 1) + // do not use early return here + if (gIdx < Threads) +# endif + { + states += 25 * gIdx; + +#if(STRIDED_INDEX==0) + Scratchpad += gIdx * (MEMORY >> 4); +#elif(STRIDED_INDEX==1) + Scratchpad += gIdx; +#elif(STRIDED_INDEX==2) + Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0); +#elif(STRIDED_INDEX==3) + Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + (gIdx % WORKSIZE); +#endif + + a[0] = states[0] ^ 
states[4]; + a[1] = states[1] ^ states[5]; + + b[0] = states[2] ^ states[6]; + b[1] = states[3] ^ states[7]; + b[2] = states[8] ^ states[10]; + b[3] = states[9] ^ states[11]; + } + + ulong2 bx0 = ((ulong2 *)b)[0]; + ulong2 bx1 = ((ulong2 *)b)[1]; + + mem_fence(CLK_LOCAL_MEM_FENCE); + +# if (COMP_MODE == 1) + // do not use early return here + if (gIdx < Threads) +# endif + { + + uint r0 = as_uint2(states[12]).s0; + uint r1 = as_uint2(states[12]).s1; + uint r2 = as_uint2(states[13]).s0; + uint r3 = as_uint2(states[13]).s1; + + #pragma unroll CN_UNROLL + for(int i = 0; i < ITERATIONS; ++i) + { +# ifdef __NV_CL_C_VERSION + uint idx = a[0] & 0x1FFFC0; + uint idx1 = a[0] & 0x30; + + *scratchpad_line = *(__global uint16*)((__global uchar*)(Scratchpad) + idx); +# else + uint idx = a[0] & MASK; +# endif + +#if(ALGO == cryptonight_monero_v8 && defined(__NV_CL_C_VERSION)) + *scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL; +#endif + uint4 c = SCRATCHPAD_CHUNK(0); + c = AES_Round(AES0, AES1, AES2, AES3, c, ((uint4 *)a)[0]); + + { + const ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)); + const ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); + const ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3)); + +#if (ALGO == cryptonight_r) + c ^= as_uint4(chunk1) ^ as_uint4(chunk2) ^ as_uint4(chunk3); +#endif + + SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + bx1); + SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + bx0); + SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]); + } + + SCRATCHPAD_CHUNK(0) = as_uint4(bx0) ^ c; + +# ifdef __NV_CL_C_VERSION + *(__global uint16*)((__global uchar*)(Scratchpad) + idx) = *scratchpad_line; + + idx = as_ulong2(c).s0 & 0x1FFFC0; + idx1 = as_ulong2(c).s0 & 0x30; + + *scratchpad_line = *(__global uint16*)((__global uchar*)(Scratchpad) + idx); +# else + idx = as_ulong2(c).s0 & MASK; +# endif + + uint4 tmp = SCRATCHPAD_CHUNK(0); + + tmp.s0 ^= r0 + r1; + tmp.s1 ^= r2 + r3; + const uint r4 = as_uint2(a[0]).s0; + const uint r5 = as_uint2(a[1]).s0; + const uint r6 = 
as_uint4(bx0).s0; + const uint r7 = as_uint4(bx1).s0; +#if (ALGO == cryptonight_r) + const uint r8 = as_uint4(bx1).s2; +#endif +#define ROT_BITS 32 + + XMRSTAK_INCLUDE_RANDOM_MATH + +#if (ALGO == cryptonight_r) + + const uint2 al = (uint2)(as_uint2(a[0]).s0 ^ r2, as_uint2(a[0]).s1 ^ r3); + const uint2 ah = (uint2)(as_uint2(a[1]).s0 ^ r0, as_uint2(a[1]).s1 ^ r1); +#endif + + ulong2 t; + t.s0 = mul_hi(as_ulong2(c).s0, as_ulong2(tmp).s0); + t.s1 = as_ulong2(c).s0 * as_ulong2(tmp).s0; + { + const ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)) +#if (ALGO == cryptonight_r_wow) + ^ t +#endif + ; + const ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); +#if (ALGO == cryptonight_r_wow) + t ^= chunk2; +#endif + const ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3)); + +#if (ALGO == cryptonight_r) + c ^= as_uint4(chunk1) ^ as_uint4(chunk2) ^ as_uint4(chunk3); +#endif + + SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + bx1); + SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + bx0); + SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]); + } + +#if (ALGO == cryptonight_r) + a[1] = as_ulong(ah) + t.s1; + a[0] = as_ulong(al) + t.s0; +#else + a[1] += t.s1; + a[0] += t.s0; +#endif + + SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0]; + +# ifdef __NV_CL_C_VERSION + *(__global uint16*)((__global uchar*)(Scratchpad) + idx) = *scratchpad_line; +# endif + + ((uint4 *)a)[0] ^= tmp; + bx1 = bx0; + bx0 = as_ulong2(c); + } + +# undef SCRATCHPAD_CHUNK + } + mem_fence(CLK_GLOBAL_MEM_FENCE); +} +)===" diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl index 161f2f55d..4469b0670 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl @@ -1,7 +1,6 @@ R"===( -#ifndef FAST_DIV_HEAVY_CL -#define FAST_DIV_HEAVY_CL +#if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) inline long fast_div_heavy(long _a, int _b) { long a 
= abs(_a); @@ -19,6 +18,5 @@ inline long fast_div_heavy(long _a, int _b) const long q = q1 + q2 + q3; return ((as_int2(_a).s1 ^ _b) < 0) ? -q : q; } - #endif )===" diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl index c170387b4..8878db618 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl @@ -3,8 +3,7 @@ R"===( * @author SChernykh */ -// cryptonight_monero_v8 -#if(ALGO==11) +#if(ALGO == cryptonight_monero_v8) static const __constant uint RCP_C[256] = { diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp index ba4cebb7b..ea688e053 100644 --- a/xmrstak/backend/amd/autoAdjust.hpp +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -83,10 +83,13 @@ class autoAdjust constexpr size_t byteToMiB = 1024u * 1024u; - size_t hashMemSize = std::max( - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()), - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) - ); + auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); + + size_t hashMemSize = 0; + for(const auto algo : neededAlgorithms) + { + hashMemSize = std::max(hashMemSize, algo.Mem()); + } std::string conf; for(auto& ctx : devVec) @@ -128,18 +131,17 @@ class autoAdjust } // check if cryptonight_monero_v8 is selected for the user or dev pool - bool useCryptonight_v8 = - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 || - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8 || - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_monero_v8 || - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_monero_v8; + bool useCryptonight_v8 = 
(std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end()); // true for all cryptonight_heavy derivates since we check the user and dev pool - bool useCryptonight_heavy = - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_heavy || - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_heavy || - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_heavy || - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_heavy; + bool useCryptonight_heavy = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end(); + + // true for cryptonight_gpu as main user pool algorithm + bool useCryptonight_gpu = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_gpu; + + bool useCryptonight_r = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_r; + + bool useCryptonight_r_wow = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_r_wow; // set strided index to default ctx.stridedIndex = 1; @@ -149,14 +151,28 @@ class autoAdjust ctx.stridedIndex = 0; // use chunked (4x16byte) scratchpad for all backends. 
Default `mem_chunk` is `2` - if(useCryptonight_v8) + if(useCryptonight_v8 || useCryptonight_r || useCryptonight_r_wow) ctx.stridedIndex = 2; else if(useCryptonight_heavy) ctx.stridedIndex = 3; - // increase all intensity limits by two for aeon - if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_lite) - maxThreads *= 2u; + if(hashMemSize < CN_MEMORY) + { + size_t factor = CN_MEMORY / hashMemSize; + // increase all intensity relative to the original scratchpad size + maxThreads *= factor; + } + + uint32_t numUnroll = 8; + + if(useCryptonight_gpu) + { + // 6 waves per compute unit are a good value (based on profiling) + // @todo check again after all optimizations + maxThreads = ctx.computeUnits * 6 * 8; + ctx.stridedIndex = 0; + numUnroll = 1; + } // keep 128MiB memory free (value is randomly chosen) from the max available memory const size_t maxAvailableFreeMem = ctx.freeMem - minFreeMem; @@ -164,7 +180,7 @@ class autoAdjust size_t memPerThread = std::min(ctx.maxMemPerAlloc, maxAvailableFreeMem); uint32_t numThreads = 1u; - if(ctx.isAMD) + if(ctx.isAMD && !useCryptonight_gpu) { numThreads = 2; size_t memDoubleThread = maxAvailableFreeMem / numThreads; @@ -199,7 +215,7 @@ class autoAdjust conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" + " \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" + " \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n" - " \"unroll\" : 8, \"comp_mode\" : true, \"interleave\" : " + std::to_string(ctx.interleave) + "\n" + + " \"unroll\" : " + std::to_string(numUnroll) + ", \"comp_mode\" : true, \"interleave\" : " + std::to_string(ctx.interleave) + "\n" + " },\n"; } } diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp index b0f4e6ecd..eb0009413 100644 --- a/xmrstak/backend/amd/minethd.cpp +++ b/xmrstak/backend/amd/minethd.cpp @@ -183,7 
+183,11 @@ void minethd::work_main() } // start with root algorithm and switch later if fork version is reached auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot(); - cn_hash_fun hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo); + + cpu::minethd::cn_on_new_job set_job; + + cn_hash_fun hash_fun; + cpu::minethd::func_multi_selector<1>(hash_fun, set_job, ::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo); uint8_t version = 0; size_t lastPoolId = 0; @@ -224,23 +228,26 @@ void minethd::work_main() if(new_version >= coinDesc.GetMiningForkVersion()) { miner_algo = coinDesc.GetMiningAlgo(); - hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo); + cpu::minethd::func_multi_selector<1>(hash_fun, set_job, ::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo); } else { miner_algo = coinDesc.GetMiningAlgoRoot(); - hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo); + cpu::minethd::func_multi_selector<1>(hash_fun, set_job, ::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo); } lastPoolId = oWork.iPoolId; version = new_version; } + if(set_job != nullptr) + set_job(oWork, &cpu_ctx); + size_t round_ctr = 0; assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID)); uint64_t target = oWork.iTarget; - XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target, miner_algo); + XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target, miner_algo, cpu_ctx->cn_r_ctx.height); if(oWork.bNiceHash) pGpuCtx->Nonce = *(uint32_t*)(oWork.bWorkBlob + 39); @@ -275,7 +282,7 @@ void minethd::work_main() *(uint32_t*)(bWorkBlob + 39) = results[i]; - hash_fun(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx); + hash_fun(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx, miner_algo); if ( (*((uint64_t*)(bResult + 24))) < 
oWork.iTarget) executor::inst()->push_event(ex_event(job_result(oWork.sJobID, results[i], bResult, iThreadNo, miner_algo), oWork.iPoolId)); else @@ -327,7 +334,7 @@ void minethd::work_main() ); } // update gpu with new intensity - XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target, miner_algo); + XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target, miner_algo, cpu_ctx->cn_r_ctx.height); } // use 3 rounds to warm up with the new intensity else if(cntTestRounds == autoTune + 3) diff --git a/xmrstak/backend/amd/minethd.hpp b/xmrstak/backend/amd/minethd.hpp index 74ab5fb60..402d63cd6 100644 --- a/xmrstak/backend/amd/minethd.hpp +++ b/xmrstak/backend/amd/minethd.hpp @@ -24,7 +24,7 @@ class minethd : public iBackend static bool init_gpus(); private: - typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**); + typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**, const xmrstak_algo&); minethd(miner_work& pWork, size_t iNo, GpuContext* ctx, const jconf::thd_cfg cfg); diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp index e7f3e9148..ba0e6984f 100644 --- a/xmrstak/backend/cpu/autoAdjust.hpp +++ b/xmrstak/backend/cpu/autoAdjust.hpp @@ -28,11 +28,15 @@ class autoAdjust bool printConfig() { + auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); + + size_t hashMemSize = 0; + for(const auto algo : neededAlgorithms) + { + hashMemSize = std::max(hashMemSize, algo.Mem()); + } + const size_t hashMemSizeKB = hashMemSize / 1024u; - const size_t hashMemSizeKB = std::max( - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()), - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) - ) / 1024u; const size_t halfHashMemSizeKB = hashMemSizeKB / 2u; configEditor configTpl{}; @@ -45,7 +49,14 @@ class autoAdjust std::string conf; + // if cryptonight_gpu is used we will disable cpu 
mining but provide a inactive config + bool useCryptonight_gpu = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_gpu; + if(useCryptonight_gpu) + { + printer::inst()->print_msg(L0, "WARNING: CPU mining will be disabled because cryptonight_gpu is not suitable for CPU mining. You can uncomment the auto generated config in %s to enable CPU mining.", params::inst().configFileCPU.c_str()); + conf += "/*\n//CPU config is disabled by default because cryptonight_gpu is not suitable for CPU mining.\n"; + } if(!detectL3Size() || L3KB_size < halfHashMemSizeKB || L3KB_size > (halfHashMemSizeKB * 2048u)) { if(L3KB_size < halfHashMemSizeKB || L3KB_size > (halfHashMemSizeKB * 2048)) @@ -96,6 +107,9 @@ class autoAdjust } } + if(useCryptonight_gpu) + conf += "*/\n"; + configTpl.replace("CPUCONFIG",conf); configTpl.write(params::inst().configFileCPU); printer::inst()->print_msg(L0, "CPU configuration stored in file '%s'", params::inst().configFileCPU.c_str()); diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp index b61582588..f09b1ebc0 100644 --- a/xmrstak/backend/cpu/autoAdjustHwloc.hpp +++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp @@ -28,10 +28,12 @@ class autoAdjust autoAdjust() { - hashMemSize = std::max( - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()), - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) - ); + auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); + + for(const auto algo : neededAlgorithms) + { + hashMemSize = std::max(hashMemSize, algo.Mem()); + } halfHashMemSize = hashMemSize / 2u; } @@ -51,6 +53,15 @@ class autoAdjust ; configTpl.set( std::string(tpl) ); + // if cryptonight_gpu is used we will disable cpu mining but provide a inactive config + bool useCryptonight_gpu = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() 
== cryptonight_gpu; + + if(useCryptonight_gpu) + { + printer::inst()->print_msg(L0, "WARNING: CPU mining will be disabled because cryptonight_gpu is not suitable for CPU mining. You can uncomment the auto generated config in %s to enable CPU mining.", params::inst().configFileCPU.c_str()); + conf += "/*\n//CPU config is disabled by default because cryptonight_gpu is not suitable for CPU mining.\n"; + } + try { std::vector tlcs; @@ -83,6 +94,9 @@ class autoAdjust printer::inst()->print_msg(L0, "Autoconf FAILED: %s. Create config for a single thread.", err.what()); } + if(useCryptonight_gpu) + conf += "*/\n"; + configTpl.replace("CPUCONFIG",conf); configTpl.write(params::inst().configFileCPU); printer::inst()->print_msg(L0, "CPU configuration stored in file '%s'", params::inst().configFileCPU.c_str()); @@ -93,8 +107,8 @@ class autoAdjust } private: - size_t hashMemSize; - size_t halfHashMemSize; + size_t hashMemSize = 0; + size_t halfHashMemSize = 0; std::vector results; diff --git a/xmrstak/backend/cpu/crypto/cn_gpu.hpp b/xmrstak/backend/cpu/crypto/cn_gpu.hpp new file mode 100644 index 000000000..5844d3814 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/cn_gpu.hpp @@ -0,0 +1,42 @@ +#pragma once + +#include "xmrstak/backend/cryptonight.hpp" +#include + +#if defined(_WIN32) || defined(_WIN64) +#include +#include +#define HAS_WIN_INTRIN_API +#endif + +#ifdef __GNUC__ +#include +#if !defined(HAS_WIN_INTRIN_API) +#include +#endif // !defined(HAS_WIN_INTRIN_API) +#endif // __GNUC__ + +inline void cngpu_cpuid(uint32_t eax, int32_t ecx, int32_t val[4]) +{ + val[0] = 0; + val[1] = 0; + val[2] = 0; + val[3] = 0; + +#if defined(HAS_WIN_INTRIN_API) + __cpuidex(val, eax, ecx); +#else + __cpuid_count(eax, ecx, val[0], val[1], val[2], val[3]); +#endif +} + +inline bool cngpu_check_avx2() +{ + int32_t cpu_info[4]; + cngpu_cpuid(7, 0, cpu_info); + return (cpu_info[1] & (1 << 5)) != 0; +} + +void cn_gpu_inner_avx(const uint8_t* spad, uint8_t* lpad, const xmrstak_algo& algo); + +void 
cn_gpu_inner_ssse3(const uint8_t* spad, uint8_t* lpad, const xmrstak_algo& algo); diff --git a/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp b/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp new file mode 100644 index 000000000..8b4aefe13 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp @@ -0,0 +1,177 @@ +#include "cn_gpu.hpp" +#include "../../cryptonight.hpp" + +#pragma GCC target ("avx2") + +inline void prep_dv_avx(__m256i* idx, __m256i& v, __m256& n01) +{ + v = _mm256_load_si256(idx); + n01 = _mm256_cvtepi32_ps(v); +} + +inline __m256 fma_break(const __m256& x) +{ + // Break the dependency chain by setitng the exp to ?????01 + __m256 xx = _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0xFEFFFFFF)), x); + return _mm256_or_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x00800000)), xx); +} + +// 14 +inline void sub_round(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3, const __m256& rnd_c, __m256& n, __m256& d, __m256& c) +{ + __m256 nn = _mm256_mul_ps(n0, c); + nn = _mm256_mul_ps(_mm256_add_ps(n1, c), _mm256_mul_ps(nn, nn)); + nn = fma_break(nn); + n = _mm256_add_ps(n, nn); + + __m256 dd = _mm256_mul_ps(n2, c); + dd = _mm256_mul_ps(_mm256_sub_ps(n3, c), _mm256_mul_ps(dd, dd)); + dd = fma_break(dd); + d = _mm256_add_ps(d, dd); + + //Constant feedback + c = _mm256_add_ps(c, rnd_c); + c = _mm256_add_ps(c, _mm256_set1_ps(0.734375f)); + __m256 r = _mm256_add_ps(nn, dd); + r = _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x807FFFFF)), r); + r = _mm256_or_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x40000000)), r); + c = _mm256_add_ps(c, r); +} + +// 14*8 + 2 = 112 +inline void round_compute(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3, const __m256& rnd_c, __m256& c, __m256& r) +{ + __m256 n = _mm256_setzero_ps(), d = _mm256_setzero_ps(); + + sub_round(n0, n1, n2, n3, rnd_c, n, d, c); + sub_round(n1, n2, n3, n0, rnd_c, n, d, c); + sub_round(n2, n3, n0, n1, rnd_c, n, d, c); + sub_round(n3, n0, n1, n2, rnd_c, n, d, 
c); + sub_round(n3, n2, n1, n0, rnd_c, n, d, c); + sub_round(n2, n1, n0, n3, rnd_c, n, d, c); + sub_round(n1, n0, n3, n2, rnd_c, n, d, c); + sub_round(n0, n3, n2, n1, rnd_c, n, d, c); + + // Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0 + d = _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0xFF7FFFFF)), d); + d = _mm256_or_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x40000000)), d); + r = _mm256_add_ps(r, _mm256_div_ps(n, d)); +} + +// 112×4 = 448 +template +inline __m256i double_comupte(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3, + float lcnt, float hcnt, const __m256& rnd_c, __m256& sum) +{ + __m256 c = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_set1_ps(lcnt)), _mm_set1_ps(hcnt), 1); + __m256 r = _mm256_setzero_ps(); + + round_compute(n0, n1, n2, n3, rnd_c, c, r); + round_compute(n0, n1, n2, n3, rnd_c, c, r); + round_compute(n0, n1, n2, n3, rnd_c, c, r); + round_compute(n0, n1, n2, n3, rnd_c, c, r); + + // do a quick fmod by setting exp to 2 + r = _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x807FFFFF)), r); + r = _mm256_or_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x40000000)), r); + + if(add) + sum = _mm256_add_ps(sum, r); + else + sum = r; + + r = _mm256_mul_ps(r, _mm256_set1_ps(536870880.0f)); // 35 + return _mm256_cvttps_epi32(r); +} + +template +inline void double_comupte_wrap(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3, + float lcnt, float hcnt, const __m256& rnd_c, __m256& sum, __m256i& out) +{ + __m256i r = double_comupte(n0, n1, n2, n3, lcnt, hcnt, rnd_c, sum); + if(rot != 0) + r = _mm256_or_si256(_mm256_bslli_epi128(r, 16 - rot), _mm256_bsrli_epi128(r, rot)); + + out = _mm256_xor_si256(out, r); +} + + +inline __m256i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n, const uint32_t mask) { return reinterpret_cast<__m256i*>(lpad + (idx & mask) + n*16); } + + +void cn_gpu_inner_avx(const uint8_t* spad, uint8_t* lpad, const 
xmrstak_algo& algo) +{ + const uint32_t ITER = algo.Iter(); + const uint32_t mask = algo.Mask(); + + uint32_t s = reinterpret_cast(spad)[0] >> 8; + __m256i* idx0 = scratchpad_ptr(lpad, s, 0, mask); + __m256i* idx2 = scratchpad_ptr(lpad, s, 2, mask); + __m256 sum0 = _mm256_setzero_ps(); + + for(size_t i = 0; i < ITER; i++) + { + __m256i v01, v23; + __m256 suma, sumb, sum1; + __m256 rc = sum0; + + __m256 n01, n23; + __m256 d01, d23; + prep_dv_avx(idx0, v01, n01); + prep_dv_avx(idx2, v23, n23); + + __m256i out, out2; + __m256 n10, n22, n33; + n10 = _mm256_permute2f128_ps(n01, n01, 0x01); + n22 = _mm256_permute2f128_ps(n23, n23, 0x00); + n33 = _mm256_permute2f128_ps(n23, n23, 0x11); + + out = _mm256_setzero_si256(); + double_comupte_wrap<0>(n01, n10, n22, n33, 1.3437500f, 1.4296875f, rc, suma, out); + double_comupte_wrap<1>(n01, n22, n33, n10, 1.2812500f, 1.3984375f, rc, suma, out); + double_comupte_wrap<2>(n01, n33, n10, n22, 1.3593750f, 1.3828125f, rc, sumb, out); + double_comupte_wrap<3>(n01, n33, n22, n10, 1.3671875f, 1.3046875f, rc, sumb, out); + _mm256_store_si256(idx0, _mm256_xor_si256(v01, out)); + sum0 = _mm256_add_ps(suma, sumb); + out2 = out; + + __m256 n11, n02, n30; + n11 = _mm256_permute2f128_ps(n01, n01, 0x11); + n02 = _mm256_permute2f128_ps(n01, n23, 0x20); + n30 = _mm256_permute2f128_ps(n01, n23, 0x03); + + out = _mm256_setzero_si256(); + double_comupte_wrap<0>(n23, n11, n02, n30, 1.4140625f, 1.3203125f, rc, suma, out); + double_comupte_wrap<1>(n23, n02, n30, n11, 1.2734375f, 1.3515625f, rc, suma, out); + double_comupte_wrap<2>(n23, n30, n11, n02, 1.2578125f, 1.3359375f, rc, sumb, out); + double_comupte_wrap<3>(n23, n30, n02, n11, 1.2890625f, 1.4609375f, rc, sumb, out); + _mm256_store_si256(idx2, _mm256_xor_si256(v23, out)); + sum1 = _mm256_add_ps(suma, sumb); + + out2 = _mm256_xor_si256(out2, out); + out2 = _mm256_xor_si256(_mm256_permute2x128_si256(out2,out2,0x41), out2); + suma = _mm256_permute2f128_ps(sum0, sum1, 0x30); + sumb = 
_mm256_permute2f128_ps(sum0, sum1, 0x21); + sum0 = _mm256_add_ps(suma, sumb); + sum0 = _mm256_add_ps(sum0, _mm256_permute2f128_ps(sum0, sum0, 0x41)); + + // Clear the high 128 bits + __m128 sum = _mm256_castps256_ps128(sum0); + + sum = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)), sum); // take abs(va) by masking the float sign bit + // vs range 0 - 64 + __m128i v0 = _mm_cvttps_epi32(_mm_mul_ps(sum, _mm_set1_ps(16777216.0f))); + v0 = _mm_xor_si128(v0, _mm256_castsi256_si128(out2)); + __m128i v1 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(0, 1, 2, 3)); + v0 = _mm_xor_si128(v0, v1); + v1 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(0, 1, 0, 1)); + v0 = _mm_xor_si128(v0, v1); + + // vs is now between 0 and 1 + sum = _mm_div_ps(sum, _mm_set1_ps(64.0f)); + sum0 = _mm256_insertf128_ps(_mm256_castps128_ps256(sum), sum, 1); + uint32_t n = _mm_cvtsi128_si32(v0); + idx0 = scratchpad_ptr(lpad, n, 0, mask); + idx2 = scratchpad_ptr(lpad, n, 2, mask); + } +} diff --git a/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp b/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp new file mode 100644 index 000000000..c8627d8b8 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp @@ -0,0 +1,181 @@ +#include "cn_gpu.hpp" +#include "../../cryptonight.hpp" + +#pragma GCC target ("sse2") + +inline void prep_dv(__m128i* idx, __m128i& v, __m128& n) +{ + v = _mm_load_si128(idx); + n = _mm_cvtepi32_ps(v); +} + +inline __m128 fma_break(__m128 x) +{ + // Break the dependency chain by setitng the exp to ?????01 + x = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0xFEFFFFFF)), x); + return _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x00800000)), x); +} + +// 14 +inline void sub_round(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c, __m128& n, __m128& d, __m128& c) +{ + n1 = _mm_add_ps(n1, c); + __m128 nn = _mm_mul_ps(n0, c); + nn = _mm_mul_ps(n1, _mm_mul_ps(nn,nn)); + nn = fma_break(nn); + n = _mm_add_ps(n, nn); + + n3 = _mm_sub_ps(n3, c); + __m128 dd = _mm_mul_ps(n2, c); + dd = _mm_mul_ps(n3, 
_mm_mul_ps(dd,dd)); + dd = fma_break(dd); + d = _mm_add_ps(d, dd); + + //Constant feedback + c = _mm_add_ps(c, rnd_c); + c = _mm_add_ps(c, _mm_set1_ps(0.734375f)); + __m128 r = _mm_add_ps(nn, dd); + r = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x807FFFFF)), r); + r = _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x40000000)), r); + c = _mm_add_ps(c, r); +} + +// 14*8 + 2 = 112 +inline void round_compute(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c, __m128& c, __m128& r) +{ + __m128 n = _mm_setzero_ps(), d = _mm_setzero_ps(); + + sub_round(n0, n1, n2, n3, rnd_c, n, d, c); + sub_round(n1, n2, n3, n0, rnd_c, n, d, c); + sub_round(n2, n3, n0, n1, rnd_c, n, d, c); + sub_round(n3, n0, n1, n2, rnd_c, n, d, c); + sub_round(n3, n2, n1, n0, rnd_c, n, d, c); + sub_round(n2, n1, n0, n3, rnd_c, n, d, c); + sub_round(n1, n0, n3, n2, rnd_c, n, d, c); + sub_round(n0, n3, n2, n1, rnd_c, n, d, c); + + // Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0 + d = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0xFF7FFFFF)), d); + d = _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x40000000)), d); + r =_mm_add_ps(r, _mm_div_ps(n,d)); +} + +// 112×4 = 448 +template +inline __m128i single_comupte(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum) +{ + __m128 c = _mm_set1_ps(cnt); + __m128 r = _mm_setzero_ps(); + + round_compute(n0, n1, n2, n3, rnd_c, c, r); + round_compute(n0, n1, n2, n3, rnd_c, c, r); + round_compute(n0, n1, n2, n3, rnd_c, c, r); + round_compute(n0, n1, n2, n3, rnd_c, c, r); + + // do a quick fmod by setting exp to 2 + r = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x807FFFFF)), r); + r = _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x40000000)), r); + + if(add) + sum = _mm_add_ps(sum, r); + else + sum = r; + + r = _mm_mul_ps(r, _mm_set1_ps(536870880.0f)); // 35 + return _mm_cvttps_epi32(r); +} + +template +inline void single_comupte_wrap(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, 
__m128 rnd_c, __m128& sum, __m128i& out) +{ + __m128i r = single_comupte(n0, n1, n2, n3, cnt, rnd_c, sum); + if(rot != 0) + r = _mm_or_si128(_mm_slli_si128(r, 16 - rot), _mm_srli_si128(r, rot)); + out = _mm_xor_si128(out, r); +} + +inline __m128i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n, const uint32_t mask) { return reinterpret_cast<__m128i*>(lpad + (idx & mask) + n*16); } + +void cn_gpu_inner_ssse3(const uint8_t* spad, uint8_t* lpad, const xmrstak_algo& algo) +{ + const uint32_t ITER = algo.Iter(); + const uint32_t mask = algo.Mask(); + + uint32_t s = reinterpret_cast(spad)[0] >> 8; + __m128i* idx0 = scratchpad_ptr(lpad, s, 0, mask); + __m128i* idx1 = scratchpad_ptr(lpad, s, 1, mask); + __m128i* idx2 = scratchpad_ptr(lpad, s, 2, mask); + __m128i* idx3 = scratchpad_ptr(lpad, s, 3, mask); + __m128 sum0 = _mm_setzero_ps(); + + for(size_t i = 0; i < ITER; i++) + { + __m128 n0, n1, n2, n3; + __m128i v0, v1, v2, v3; + __m128 suma, sumb, sum1, sum2, sum3; + + prep_dv(idx0, v0, n0); + prep_dv(idx1, v1, n1); + prep_dv(idx2, v2, n2); + prep_dv(idx3, v3, n3); + __m128 rc = sum0; + + __m128i out, out2; + out = _mm_setzero_si128(); + single_comupte_wrap<0>(n0, n1, n2, n3, 1.3437500f, rc, suma, out); + single_comupte_wrap<1>(n0, n2, n3, n1, 1.2812500f, rc, suma, out); + single_comupte_wrap<2>(n0, n3, n1, n2, 1.3593750f, rc, sumb, out); + single_comupte_wrap<3>(n0, n3, n2, n1, 1.3671875f, rc, sumb, out); + sum0 = _mm_add_ps(suma, sumb); + _mm_store_si128(idx0, _mm_xor_si128(v0, out)); + out2 = out; + + out = _mm_setzero_si128(); + single_comupte_wrap<0>(n1, n0, n2, n3, 1.4296875f, rc, suma, out); + single_comupte_wrap<1>(n1, n2, n3, n0, 1.3984375f, rc, suma, out); + single_comupte_wrap<2>(n1, n3, n0, n2, 1.3828125f, rc, sumb, out); + single_comupte_wrap<3>(n1, n3, n2, n0, 1.3046875f, rc, sumb, out); + sum1 = _mm_add_ps(suma, sumb); + _mm_store_si128(idx1, _mm_xor_si128(v1, out)); + out2 = _mm_xor_si128(out2, out); + + out = _mm_setzero_si128(); + 
single_comupte_wrap<0>(n2, n1, n0, n3, 1.4140625f, rc, suma, out); + single_comupte_wrap<1>(n2, n0, n3, n1, 1.2734375f, rc, suma, out); + single_comupte_wrap<2>(n2, n3, n1, n0, 1.2578125f, rc, sumb, out); + single_comupte_wrap<3>(n2, n3, n0, n1, 1.2890625f, rc, sumb, out); + sum2 = _mm_add_ps(suma, sumb); + _mm_store_si128(idx2, _mm_xor_si128(v2, out)); + out2 = _mm_xor_si128(out2, out); + + out = _mm_setzero_si128(); + single_comupte_wrap<0>(n3, n1, n2, n0, 1.3203125f, rc, suma, out); + single_comupte_wrap<1>(n3, n2, n0, n1, 1.3515625f, rc, suma, out); + single_comupte_wrap<2>(n3, n0, n1, n2, 1.3359375f, rc, sumb, out); + single_comupte_wrap<3>(n3, n0, n2, n1, 1.4609375f, rc, sumb, out); + sum3 = _mm_add_ps(suma, sumb); + _mm_store_si128(idx3, _mm_xor_si128(v3, out)); + out2 = _mm_xor_si128(out2, out); + sum0 = _mm_add_ps(sum0, sum1); + sum2 = _mm_add_ps(sum2, sum3); + sum0 = _mm_add_ps(sum0, sum2); + + sum0 = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)), sum0); // take abs(va) by masking the float sign bit + // vs range 0 - 64 + n0 = _mm_mul_ps(sum0, _mm_set1_ps(16777216.0f)); + v0 = _mm_cvttps_epi32(n0); + v0 = _mm_xor_si128(v0, out2); + v1 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(0, 1, 2, 3)); + v0 = _mm_xor_si128(v0, v1); + v1 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(0, 1, 0, 1)); + v0 = _mm_xor_si128(v0, v1); + + // vs is now between 0 and 1 + sum0 = _mm_div_ps(sum0, _mm_set1_ps(64.0f)); + uint32_t n = _mm_cvtsi128_si32(v0); + idx0 = scratchpad_ptr(lpad, n, 0, mask); + idx1 = scratchpad_ptr(lpad, n, 1, mask); + idx2 = scratchpad_ptr(lpad, n, 2, mask); + idx3 = scratchpad_ptr(lpad, n, 3, mask); + } +} diff --git a/xmrstak/backend/cpu/crypto/cryptonight.h b/xmrstak/backend/cpu/crypto/cryptonight.h index 5c9a73332..a7c77cdac 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight.h +++ b/xmrstak/backend/cpu/crypto/cryptonight.h @@ -1,29 +1,31 @@ -#ifndef __CRYPTONIGHT_H_INCLUDED -#define __CRYPTONIGHT_H_INCLUDED - -#ifdef __cplusplus -extern "C" { -#endif - 
+#pragma once #include #include -typedef struct { +#include "variant4_random_math.h" + +struct extra_ctx_r +{ + uint64_t height = 0; + // the buffer must be able to hold NUM_INSTRUCTIONS_MAX and a termination instruction + V4_Instruction code[NUM_INSTRUCTIONS_MAX + 1]; +}; + +struct cryptonight_ctx +{ uint8_t hash_state[224]; // Need only 200, explicit align uint8_t* long_state; uint8_t ctx_info[24]; //Use some of the extra memory for flags -} cryptonight_ctx; + extra_ctx_r cn_r_ctx; +}; -typedef struct { +struct alloc_msg +{ const char* warning; -} alloc_msg; +}; size_t cryptonight_init(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg); cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg); void cryptonight_free_ctx(cryptonight_ctx* ctx); -#ifdef __cplusplus -} -#endif -#endif diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index 06cbe8740..43f719873 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -17,6 +17,8 @@ #include "cryptonight.h" #include "xmrstak/backend/cryptonight.hpp" +#include "../../miner_work.hpp" +#include "cn_gpu.hpp" #include #include #include @@ -164,9 +166,11 @@ inline void mix_and_propagate(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3 x7 = _mm_xor_si128(x7, tmp0); } -template -void cn_explode_scratchpad(const __m128i* input, __m128i* output) +template +void cn_explode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_algo& algo) { + constexpr bool HEAVY_MIX = ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast; + // This is more than we have registers, compiler will assign 2 keys on the stack __m128i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7; __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; @@ -182,7 +186,7 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output) xin6 = 
_mm_load_si128(input + 10); xin7 = _mm_load_si128(input + 11); - if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + if(HEAVY_MIX) { for(size_t i=0; i < 16; i++) { @@ -216,6 +220,7 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output) } } + const size_t MEM = algo.Mem(); for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) { if(SOFT_AES) @@ -263,9 +268,46 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output) } } -template -void cn_implode_scratchpad(const __m128i* input, __m128i* output) +template +void cn_explode_scratchpad_gpu(const uint8_t* input, uint8_t* output, const xmrstak_algo& algo) +{ + constexpr size_t hash_size = 200; // 25x8 bytes + alignas(128) uint64_t hash[25]; + const size_t mem = algo.Mem(); + + for (uint64_t i = 0; i < mem / 512; i++) + { + memcpy(hash, input, hash_size); + hash[0] ^= i; + + keccakf(hash, 24); + memcpy(output, hash, 160); + output+=160; + + keccakf(hash, 24); + memcpy(output, hash, 176); + output+=176; + + keccakf(hash, 24); + memcpy(output, hash, 176); + output+=176; + + if(PREFETCH) + { + _mm_prefetch((const char*)output - 512, _MM_HINT_T2); + _mm_prefetch((const char*)output - 384, _MM_HINT_T2); + _mm_prefetch((const char*)output - 256, _MM_HINT_T2); + _mm_prefetch((const char*)output - 128, _MM_HINT_T2); + } + } +} + +template +void cn_implode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_algo& algo) { + constexpr bool HEAVY_MIX = ALGO == cryptonight_heavy || ALGO == cryptonight_haven || + ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast || ALGO == cryptonight_gpu; + // This is more than we have registers, compiler will assign 2 keys on the stack __m128i xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7; __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; @@ -281,6 +323,7 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output) xout6 = _mm_load_si128(output + 10); xout7 = 
_mm_load_si128(output + 11); + const size_t MEM = algo.Mem(); for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) { if(PREFETCH) @@ -326,11 +369,11 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output) aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); } - if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + if(HEAVY_MIX) mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7); } - if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + if(HEAVY_MIX) { for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) { @@ -377,7 +420,7 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output) aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); } - if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + if(HEAVY_MIX) mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7); } @@ -465,7 +508,7 @@ inline __m128i aes_round_bittube2(const __m128i& val, const __m128i& key) return _mm_load_si128((__m128i*)k); } -template +template inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) { mem_out[0] = _mm_cvtsi128_si64(tmp); @@ -543,9 +586,39 @@ inline void set_float_rounding_mode() #endif } -#define CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1) \ +inline void set_float_rounding_mode_nearest() +{ +#ifdef _MSC_VER + _control87(RC_NEAR, MCW_RC); +#else + std::fesetround(FE_TONEAREST); +#endif +} + +inline __m128 _mm_set1_ps_epi32(uint32_t x) +{ + return _mm_castsi128_ps(_mm_set1_epi32(x)); +} + +inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var) +{ + __m128 r = _mm_cvtepi32_ps(cx); + __m128 c_old = conc_var; + r = _mm_add_ps(r, conc_var); + r = _mm_mul_ps(r, _mm_mul_ps(r, r)); + r = _mm_and_ps(_mm_set1_ps_epi32(0x807FFFFF), 
r); + r = _mm_or_ps(_mm_set1_ps_epi32(0x40000000), r); + conc_var = _mm_add_ps(conc_var, r); + + c_old = _mm_and_ps(_mm_set1_ps_epi32(0x807FFFFF), c_old); + c_old = _mm_or_ps(_mm_set1_ps_epi32(0x40000000), c_old); + __m128 nc = _mm_mul_ps(c_old, _mm_set1_ps(536870880.0f)); + cx = _mm_xor_si128(cx, _mm_cvttps_epi32(nc)); +} + +#define CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1, cx) \ /* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \ - if(ALGO == cryptonight_monero_v8) \ + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_r || ALGO == cryptonight_r_wow) \ { \ const uint64_t idx1 = idx0 & MASK; \ const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]); \ @@ -554,11 +627,13 @@ inline void set_float_rounding_mode() _mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \ _mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \ _mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ + if (ALGO == cryptonight_r) \ + cx = _mm_xor_si128(_mm_xor_si128(cx, chunk3), _mm_xor_si128(chunk1, chunk2)); \ } #define CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi) \ /* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \ - if(ALGO == cryptonight_monero_v8) \ + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_r_wow) \ { \ const uint64_t idx1 = idx0 & MASK; \ const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi)); \ @@ -595,6 +670,23 @@ inline void set_float_rounding_mode() assign(sqrt_result, int_sqrt33_1_double_precision(cx_64 + division_result)); \ } +#define CN_R_RANDOM_MATH(n, al, ah, cl, bx0, bx1, cn_r_data) \ + if (ALGO == cryptonight_r || ALGO == cryptonight_r_wow) \ + { \ + cl ^= (cn_r_data[0] + cn_r_data[1]) | ((uint64_t)(cn_r_data[2] + cn_r_data[3]) << 32); \ + cn_r_data[4] = static_cast(al); \ + cn_r_data[5] = static_cast(ah); \ + cn_r_data[6] = 
static_cast(_mm_cvtsi128_si32(bx0)); \ + cn_r_data[7] = static_cast(_mm_cvtsi128_si32(bx1)); \ + cn_r_data[8] = static_cast(_mm_cvtsi128_si32(_mm_srli_si128(bx1, 8))); \ + v4_random_math(ctx[n]->cn_r_ctx.code, cn_r_data); \ + } \ + if (ALGO == cryptonight_r) \ + { \ + al ^= cn_r_data[2] | ((uint64_t)(cn_r_data[3]) << 32); \ + ah ^= cn_r_data[0] | ((uint64_t)(cn_r_data[1]) << 32); \ + } + #define CN_INIT_SINGLE \ if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) \ { \ @@ -602,7 +694,7 @@ inline void set_float_rounding_mode() return; \ } -#define CN_INIT(n, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm) \ +#define CN_INIT(n, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm, cn_r_data) \ keccak((const uint8_t *)input + len * n, len, ctx[n]->hash_state, 200); \ uint64_t monero_const; \ if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ @@ -611,7 +703,7 @@ inline void set_float_rounding_mode() monero_const ^= *(reinterpret_cast(ctx[n]->hash_state) + 24); \ } \ /* Optim - 99% time boundary */ \ - cn_explode_scratchpad((__m128i*)ctx[n]->hash_state, (__m128i*)ctx[n]->long_state); \ + cn_explode_scratchpad((__m128i*)ctx[n]->hash_state, (__m128i*)ctx[n]->long_state, algo); \ \ __m128i ax0; \ uint64_t idx0; \ @@ -620,7 +712,14 @@ inline void set_float_rounding_mode() /* BEGIN cryptonight_monero_v8 variables */ \ __m128i bx1; \ __m128i division_result_xmm; \ + __m128 conc_var; \ + if(ALGO == cryptonight_conceal || ALGO == cryptonight_gpu) \ + {\ + set_float_rounding_mode_nearest(); \ + conc_var = _mm_setzero_ps(); \ + }\ GetOptimalSqrtType_t sqrt_result; \ + uint32_t cn_r_data[9]; \ /* END cryptonight_monero_v8 variables */ \ { \ 
uint64_t* h0 = (uint64_t*)ctx[n]->hash_state; \ @@ -634,13 +733,23 @@ inline void set_float_rounding_mode() assign(sqrt_result, h0[13]); \ set_float_rounding_mode(); \ } \ + if (ALGO == cryptonight_r || ALGO == cryptonight_r_wow) \ + { \ + bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \ + cn_r_data[0] = (uint32_t)(h0[12]); \ + cn_r_data[1] = (uint32_t)(h0[12] >> 32); \ + cn_r_data[2] = (uint32_t)(h0[13]); \ + cn_r_data[3] = (uint32_t)(h0[13] >> 32); \ + } \ } \ __m128i *ptr0 -#define CN_STEP1(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1) \ +#define CN_STEP1(n, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, cx, bx1) \ __m128i cx; \ ptr0 = (__m128i *)&l0[idx0 & MASK]; \ cx = _mm_load_si128(ptr0); \ + if (ALGO == cryptonight_conceal) \ + cryptonight_conceal_tweak(cx, conc_var); \ if (ALGO == cryptonight_bittube2) \ { \ cx = aes_round_bittube2(cx, ax0); \ @@ -652,7 +761,7 @@ inline void set_float_rounding_mode() else \ cx = _mm_aesenc_si128(cx, ax0); \ } \ - CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1) + CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1, cx) #define CN_STEP2(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \ if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ @@ -664,24 +773,32 @@ inline void set_float_rounding_mode() ptr0 = (__m128i *)&l0[idx0 & MASK]; \ if(PREFETCH) \ _mm_prefetch((const char*)ptr0, _MM_HINT_T0); \ - if(ALGO != cryptonight_monero_v8) \ + if(ALGO != cryptonight_monero_v8 && ALGO != cryptonight_r && ALGO != cryptonight_r_wow) \ bx0 = cx -#define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm) \ +#define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm, cn_r_data) \ uint64_t lo, cl, ch; \ uint64_t al0 = _mm_cvtsi128_si64(ax0); \ uint64_t ah0 = 
((uint64_t*)&ax0)[1]; \ cl = ((uint64_t*)ptr0)[0]; \ ch = ((uint64_t*)ptr0)[1]; \ + CN_R_RANDOM_MATH(n, al0, ah0, cl, bx0, bx1, cn_r_data); \ CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl); \ { \ uint64_t hi; \ lo = _umul128(idx0, cl, &hi); \ - CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi); \ + if(ALGO == cryptonight_r) \ + { \ + CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1, cx); \ + } \ + else \ + { \ + CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi); \ + } \ ah0 += lo; \ al0 += hi; \ } \ - if(ALGO == cryptonight_monero_v8) \ + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_r || ALGO != cryptonight_r_wow) \ { \ bx1 = bx0; \ bx0 = cx; \ @@ -729,7 +846,7 @@ inline void set_float_rounding_mode() #define CN_FINALIZE(n) \ /* Optim - 90% time boundary */ \ - cn_implode_scratchpad((__m128i*)ctx[n]->long_state, (__m128i*)ctx[n]->hash_state); \ + cn_implode_scratchpad((__m128i*)ctx[n]->long_state, (__m128i*)ctx[n]->hash_state, algo); \ /* Optim - 99% time boundary */ \ keccakf((uint64_t*)ctx[n]->hash_state, 24); \ extra_hashes[ctx[n]->hash_state[0] & 3](ctx[n]->hash_state, 200, (char*)output + 32 * n) @@ -771,6 +888,7 @@ inline void set_float_rounding_mode() #define CN_ENUM_13(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n #define CN_ENUM_14(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n #define CN_ENUM_15(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n, x15 ## n +#define CN_ENUM_16(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16) n, x1 ## n, x2 ## n, x3 ## n, x4 ## 
n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n, x15 ## n, x16 ## n /** repeat a macro call multiple times * @@ -798,22 +916,22 @@ struct Cryptonight_hash<1> { static constexpr size_t N = 1; - template - static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) { - constexpr size_t MASK = cn_select_mask(); - constexpr size_t ITERATIONS = cn_select_iter(); - constexpr size_t MEM = cn_select_memory(); + const uint32_t MASK = algo.Mask(); + const uint32_t ITERATIONS = algo.Iter(); + const size_t MEM = algo.Mem(); CN_INIT_SINGLE; - REPEAT_1(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm); + REPEAT_1(11, CN_INIT, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm, cn_r_data); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { - REPEAT_1(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); + REPEAT_1(9, CN_STEP1, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_1(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_1(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm); + REPEAT_1(16, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm, cn_r_data); REPEAT_1(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_1(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -827,22 +945,22 @@ struct Cryptonight_hash<2> { static constexpr size_t N = 2; - template - static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) { - constexpr 
size_t MASK = cn_select_mask(); - constexpr size_t ITERATIONS = cn_select_iter(); - constexpr size_t MEM = cn_select_memory(); + const uint32_t MASK = algo.Mask(); + const uint32_t ITERATIONS = algo.Iter(); + const size_t MEM = algo.Mem(); CN_INIT_SINGLE; - REPEAT_2(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm); + REPEAT_2(11, CN_INIT, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm, cn_r_data); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { - REPEAT_2(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); + REPEAT_2(9, CN_STEP1, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_2(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_2(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm); + REPEAT_2(16, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm, cn_r_data); REPEAT_2(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_2(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -856,22 +974,22 @@ struct Cryptonight_hash<3> { static constexpr size_t N = 3; - template - static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) { - constexpr size_t MASK = cn_select_mask(); - constexpr size_t ITERATIONS = cn_select_iter(); - constexpr size_t MEM = cn_select_memory(); + const uint32_t MASK = algo.Mask(); + const uint32_t ITERATIONS = algo.Iter(); + const size_t MEM = algo.Mem(); CN_INIT_SINGLE; - REPEAT_3(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm); + REPEAT_3(11, CN_INIT, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm, 
cn_r_data); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { - REPEAT_3(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); + REPEAT_3(9, CN_STEP1, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_3(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_3(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm); + REPEAT_3(16, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm, cn_r_data); REPEAT_3(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_3(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -885,22 +1003,22 @@ struct Cryptonight_hash<4> { static constexpr size_t N = 4; - template - static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) { - constexpr size_t MASK = cn_select_mask(); - constexpr size_t ITERATIONS = cn_select_iter(); - constexpr size_t MEM = cn_select_memory(); + const uint32_t MASK = algo.Mask(); + const uint32_t ITERATIONS = algo.Iter(); + const size_t MEM = algo.Mem(); CN_INIT_SINGLE; - REPEAT_4(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm); + REPEAT_4(11, CN_INIT, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm, cn_r_data); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { - REPEAT_4(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); + REPEAT_4(9, CN_STEP1, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_4(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_4(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm); + REPEAT_4(16, CN_STEP3, monero_const, 
l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm, cn_r_data); REPEAT_4(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_4(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -914,22 +1032,22 @@ struct Cryptonight_hash<5> { static constexpr size_t N = 5; - template - static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) { - constexpr size_t MASK = cn_select_mask(); - constexpr size_t ITERATIONS = cn_select_iter(); - constexpr size_t MEM = cn_select_memory(); + const uint32_t MASK = algo.Mask(); + const uint32_t ITERATIONS = algo.Iter(); + const size_t MEM = algo.Mem(); CN_INIT_SINGLE; - REPEAT_5(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm); + REPEAT_5(11, CN_INIT, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm, cn_r_data); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { - REPEAT_5(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); + REPEAT_5(9, CN_STEP1, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_5(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_5(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm); + REPEAT_5(16, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm, cn_r_data); REPEAT_5(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_5(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -951,20 +1069,18 @@ struct Cryptonight_hash_asm<1, asm_version> { static constexpr size_t N = 1; - template - static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) + template + static void 
hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) { - constexpr size_t MEM = cn_select_memory(); - keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200); - cn_explode_scratchpad((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state); + cn_explode_scratchpad((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state, algo); if(asm_version == 0) cryptonight_v8_mainloop_ivybridge_asm(ctx[0]); else if(asm_version == 1) cryptonight_v8_mainloop_ryzen_asm(ctx[0]); - cn_implode_scratchpad((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state); + cn_implode_scratchpad((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state, algo); keccakf((uint64_t*)ctx[0]->hash_state, 24); extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output); } @@ -976,16 +1092,16 @@ struct Cryptonight_hash_asm<2, 0> { static constexpr size_t N = 2; - template - static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) { - constexpr size_t MEM = cn_select_memory(); + const size_t MEM = algo.Mem(); for(size_t i = 0; i < N; ++i) { keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200); /* Optim - 99% time boundary */ - cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state); + cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state, algo); } cryptonight_v8_double_mainloop_sandybridge_asm(ctx[0], ctx[1]); @@ -993,10 +1109,48 @@ struct Cryptonight_hash_asm<2, 0> for(size_t i = 0; i < N; ++i) { /* Optim - 90% time boundary */ - cn_implode_scratchpad((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state); + cn_implode_scratchpad((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state, algo); /* Optim - 99% time boundary */ keccakf((uint64_t*)ctx[i]->hash_state, 24); 
extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i); } } }; + +struct Cryptonight_hash_gpu +{ + static constexpr size_t N = 1; + + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) + { + keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200); + cn_explode_scratchpad_gpu(ctx[0]->hash_state, ctx[0]->long_state, algo); + + if(cngpu_check_avx2()) + cn_gpu_inner_avx(ctx[0]->hash_state, ctx[0]->long_state, algo); + else + cn_gpu_inner_ssse3(ctx[0]->hash_state, ctx[0]->long_state, algo); + + cn_implode_scratchpad((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state, algo); + keccakf((uint64_t*)ctx[0]->hash_state, 24); + memcpy(output, ctx[0]->hash_state, 32); + } +}; + +template +struct Cryptonight_R_generator +{ + template + static void cn_on_new_job(const xmrstak::miner_work& work, cryptonight_ctx** ctx) + { + if(ctx[0]->cn_r_ctx.height == work.iBlockHeight) + return; + + ctx[0]->cn_r_ctx.height = work.iBlockHeight; + v4_random_math_init(ctx[0]->cn_r_ctx.code, work.iBlockHeight); + + for(size_t i=1; i < N; i++) + ctx[i]->cn_r_ctx = ctx[0]->cn_r_ctx; + } +}; diff --git a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp index a7e4696a8..a065abe01 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp +++ b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp @@ -203,10 +203,13 @@ size_t cryptonight_init(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg) cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg) { - size_t hashMemSize = std::max( - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()), - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) - ); + auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); + + size_t hashMemSize = 
0; + for(const auto algo : neededAlgorithms) + { + hashMemSize = std::max(hashMemSize, algo.Mem()); + } cryptonight_ctx* ptr = (cryptonight_ctx*)_mm_malloc(sizeof(cryptonight_ctx), 4096); @@ -284,10 +287,13 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al void cryptonight_free_ctx(cryptonight_ctx* ctx) { - size_t hashMemSize = std::max( - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()), - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) - ); + auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); + + size_t hashMemSize = 0; + for(const auto algo : neededAlgorithms) + { + hashMemSize = std::max(hashMemSize, algo.Mem()); + } if(ctx->ctx_info[0] != 0) { diff --git a/xmrstak/backend/cpu/crypto/variant4_random_math.h b/xmrstak/backend/cpu/crypto/variant4_random_math.h new file mode 100644 index 000000000..07dd3cf61 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/variant4_random_math.h @@ -0,0 +1,451 @@ +#pragma once + +#include +#include "../../cryptonight.hpp" + +extern "C" +{ + #include "c_blake256.h" +} + +enum V4_Settings +{ + // Generate code with minimal theoretical latency = 45 cycles, which is equivalent to 15 multiplications + TOTAL_LATENCY = 15 * 3, + + // Always generate at least 60 instructions + NUM_INSTRUCTIONS_MIN = 60, + + // Never generate more than 70 instructions (final RET instruction doesn't count here) + NUM_INSTRUCTIONS_MAX = 70, + + // Available ALUs for MUL + // Modern CPUs typically have only 1 ALU which can do multiplications + ALU_COUNT_MUL = 1, + + // Total available ALUs + // Modern CPUs have 4 ALUs, but we use only 3 because random math executes together with other main loop code + ALU_COUNT = 3, +}; + +enum V4_InstructionList +{ + MUL, // a*b + ADD, // a+b + C, C is an unsigned 32-bit constant + SUB, // a-b + ROR, // rotate right "a" by "b & 31" bits + ROL, // rotate left "a" 
by "b & 31" bits + XOR, // a^b + RET, // finish execution + V4_INSTRUCTION_COUNT = RET, +}; + +// V4_InstructionDefinition is used to generate code from random data +// Every random sequence of bytes is a valid code +// +// There are 9 registers in total: +// - 4 variable registers +// - 5 constant registers initialized from loop variables +// This is why dst_index is 2 bits +enum V4_InstructionDefinition +{ + V4_OPCODE_BITS = 3, + V4_DST_INDEX_BITS = 2, + V4_SRC_INDEX_BITS = 3, +}; + +struct V4_Instruction +{ + uint8_t opcode; + uint8_t dst_index; + uint8_t src_index; + uint32_t C; +}; + +#ifndef FORCEINLINE +#ifdef __GNUC__ +#define FORCEINLINE __attribute__((always_inline)) inline +#elif _MSC_VER +#define FORCEINLINE __forceinline +#else +#define FORCEINLINE inline +#endif +#endif + +#ifndef UNREACHABLE_CODE +#ifdef __GNUC__ +#define UNREACHABLE_CODE __builtin_unreachable() +#elif _MSC_VER +#define UNREACHABLE_CODE __assume(false) +#else +#define UNREACHABLE_CODE +#endif +#endif + +// Random math interpreter's loop is fully unrolled and inlined to achieve 100% branch prediction on CPU: +// every switch-case will point to the same destination on every iteration of Cryptonight main loop +// +// This is about as fast as it can get without using low-level machine code generation +template +static void v4_random_math(const struct V4_Instruction* code, v4_reg* r) +{ + enum + { + REG_BITS = sizeof(v4_reg) * 8, + }; + +#define V4_EXEC(i) \ + { \ + const struct V4_Instruction* op = code + i; \ + const v4_reg src = r[op->src_index]; \ + v4_reg* dst = r + op->dst_index; \ + switch (op->opcode) \ + { \ + case MUL: \ + *dst *= src; \ + break; \ + case ADD: \ + *dst += src + op->C; \ + break; \ + case SUB: \ + *dst -= src; \ + break; \ + case ROR: \ + { \ + const uint32_t shift = src % REG_BITS; \ + *dst = (*dst >> shift) | (*dst << ((REG_BITS - shift) % REG_BITS)); \ + } \ + break; \ + case ROL: \ + { \ + const uint32_t shift = src % REG_BITS; \ + *dst = (*dst << shift) | 
(*dst >> ((REG_BITS - shift) % REG_BITS)); \ + } \ + break; \ + case XOR: \ + *dst ^= src; \ + break; \ + case RET: \ + return; \ + default: \ + UNREACHABLE_CODE; \ + break; \ + } \ + } + +#define V4_EXEC_10(j) \ + V4_EXEC(j + 0) \ + V4_EXEC(j + 1) \ + V4_EXEC(j + 2) \ + V4_EXEC(j + 3) \ + V4_EXEC(j + 4) \ + V4_EXEC(j + 5) \ + V4_EXEC(j + 6) \ + V4_EXEC(j + 7) \ + V4_EXEC(j + 8) \ + V4_EXEC(j + 9) + + // Generated program can have 60 + a few more (usually 2-3) instructions to achieve required latency + // I've checked all block heights < 10,000,000 and here is the distribution of program sizes: + // + // 60 27960 + // 61 105054 + // 62 2452759 + // 63 5115997 + // 64 1022269 + // 65 1109635 + // 66 153145 + // 67 8550 + // 68 4529 + // 69 102 + + // Unroll 70 instructions here + V4_EXEC_10(0); // instructions 0-9 + V4_EXEC_10(10); // instructions 10-19 + V4_EXEC_10(20); // instructions 20-29 + V4_EXEC_10(30); // instructions 30-39 + V4_EXEC_10(40); // instructions 40-49 + V4_EXEC_10(50); // instructions 50-59 + V4_EXEC_10(60); // instructions 60-69 + +#undef V4_EXEC_10 +#undef V4_EXEC +} + +// If we don't have enough data available, generate more +static FORCEINLINE void check_data(size_t* data_index, const size_t bytes_needed, int8_t* data, const size_t data_size) +{ + if (*data_index + bytes_needed > data_size) + { + blake256_hash((uint8_t*)data, (uint8_t*)data, data_size); + *data_index = 0; + } +} + +#define SWAP32LE(x) x +#define SWAP64LE(x) x + +// Generates as many random math operations as possible with given latency and ALU restrictions +// "code" array must have space for NUM_INSTRUCTIONS_MAX+1 instructions +template +static int v4_random_math_init(struct V4_Instruction* code, const uint64_t height) +{ + // MUL is 3 cycles, 3-way addition and rotations are 2 cycles, SUB/XOR are 1 cycle + // These latencies match real-life instruction latencies for Intel CPUs starting from Sandy Bridge and up to Skylake/Coffee lake + // + // AMD Ryzen has the same 
latencies except 1-cycle ROR/ROL, so it'll be a bit faster than Intel Sandy Bridge and newer processors + // Surprisingly, Intel Nehalem also has 1-cycle ROR/ROL, so it'll also be faster than Intel Sandy Bridge and newer processors + // AMD Bulldozer has 4 cycles latency for MUL (slower than Intel) and 1 cycle for ROR/ROL (faster than Intel), so average performance will be the same + // Source: https://www.agner.org/optimize/instruction_tables.pdf + const int op_latency[V4_INSTRUCTION_COUNT] = { 3, 2, 1, 2, 2, 1 }; + + // Instruction latencies for theoretical ASIC implementation + const int asic_op_latency[V4_INSTRUCTION_COUNT] = { 3, 1, 1, 1, 1, 1 }; + + // Available ALUs for each instruction + const int op_ALUs[V4_INSTRUCTION_COUNT] = { ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT }; + + int8_t data[32]; + memset(data, 0, sizeof(data)); + uint64_t tmp = SWAP64LE(height); + memcpy(data, &tmp, sizeof(uint64_t)); + if(ALGO == cryptonight_r) + { + data[20] = -38; + } + + // Set data_index past the last byte in data + // to trigger full data update with blake hash + // before we start using it + size_t data_index = sizeof(data); + + int code_size; + + // There is a small chance (1.8%) that register R8 won't be used in the generated program + // So we keep track of it and try again if it's not used + bool r8_used; + do { + int latency[9]; + int asic_latency[9]; + + // Tracks previous instruction and value of the source operand for registers R0-R3 throughout code execution + // byte 0: current value of the destination register + // byte 1: instruction opcode + // byte 2: current value of the source register + // + // Registers R4-R8 are constant and are treated as having the same value because when we do + // the same operation twice with two constant source registers, it can be optimized into a single operation + uint32_t inst_data[9] = { 0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF }; + + bool alu_busy[TOTAL_LATENCY + 
1][ALU_COUNT]; + bool is_rotation[V4_INSTRUCTION_COUNT]; + bool rotated[4]; + int rotate_count = 0; + + memset(latency, 0, sizeof(latency)); + memset(asic_latency, 0, sizeof(asic_latency)); + memset(alu_busy, 0, sizeof(alu_busy)); + memset(is_rotation, 0, sizeof(is_rotation)); + memset(rotated, 0, sizeof(rotated)); + is_rotation[ROR] = true; + is_rotation[ROL] = true; + + int num_retries = 0; + code_size = 0; + + int total_iterations = 0; + r8_used = (ALGO == cryptonight_r_wow); + + // Generate random code to achieve minimal required latency for our abstract CPU + // Try to get this latency for all 4 registers + while (((latency[0] < TOTAL_LATENCY) || (latency[1] < TOTAL_LATENCY) || (latency[2] < TOTAL_LATENCY) || (latency[3] < TOTAL_LATENCY)) && (num_retries < 64)) + { + // Fail-safe to guarantee loop termination + ++total_iterations; + if (total_iterations > 256) + break; + + check_data(&data_index, 1, data, sizeof(data)); + + const uint8_t c = ((uint8_t*)data)[data_index++]; + + // MUL = opcodes 0-2 + // ADD = opcode 3 + // SUB = opcode 4 + // ROR/ROL = opcode 5, shift direction is selected randomly + // XOR = opcodes 6-7 + uint8_t opcode = c & ((1 << V4_OPCODE_BITS) - 1); + if (opcode == 5) + { + check_data(&data_index, 1, data, sizeof(data)); + opcode = (data[data_index++] >= 0) ? ROR : ROL; + } + else if (opcode >= 6) + { + opcode = XOR; + } + else + { + opcode = (opcode <= 2) ? MUL : (opcode - 2); + } + + uint8_t dst_index = (c >> V4_OPCODE_BITS) & ((1 << V4_DST_INDEX_BITS) - 1); + uint8_t src_index = (c >> (V4_OPCODE_BITS + V4_DST_INDEX_BITS)) & ((1 << V4_SRC_INDEX_BITS) - 1); + + const int a = dst_index; + int b = src_index; + + // Don't do ADD/SUB/XOR with the same register + if (((opcode == ADD) || (opcode == SUB) || (opcode == XOR)) && (a == b)) + { + // a is always < 4, so we don't need to check bounds here + b = (ALGO == cryptonight_r_wow) ? 
(a + 4) : 8; + src_index = b; + } + + // Don't do rotation with the same destination twice because it's equal to a single rotation + if (is_rotation[opcode] && rotated[a]) + { + continue; + } + + // Don't do the same instruction (except MUL) with the same source value twice because all other cases can be optimized: + // 2xADD(a, b, C) = ADD(a, b*2, C1+C2), same for SUB and rotations + // 2xXOR(a, b) = NOP + if ((opcode != MUL) && ((inst_data[a] & 0xFFFF00) == (opcode << 8) + ((inst_data[b] & 255) << 16))) + { + continue; + } + + // Find which ALU is available (and when) for this instruction + int next_latency = (latency[a] > latency[b]) ? latency[a] : latency[b]; + int alu_index = -1; + while (next_latency < TOTAL_LATENCY) + { + for (int i = op_ALUs[opcode] - 1; i >= 0; --i) + { + if (!alu_busy[next_latency][i]) + { + // ADD is implemented as two 1-cycle instructions on a real CPU, so do an additional availability check + if ((opcode == ADD) && alu_busy[next_latency + 1][i]) + { + continue; + } + + // Rotation can only start when previous rotation is finished, so do an additional availability check + if (is_rotation[opcode] && (next_latency < rotate_count * op_latency[opcode])) + { + continue; + } + + alu_index = i; + break; + } + } + if (alu_index >= 0) + { + break; + } + ++next_latency; + } + + // Don't generate instructions that leave some register unchanged for more than 7 cycles + if (next_latency > latency[a] + 7) + { + continue; + } + + next_latency += op_latency[opcode]; + + if (next_latency <= TOTAL_LATENCY) + { + if (is_rotation[opcode]) + { + ++rotate_count; + } + + // Mark ALU as busy only for the first cycle when it starts executing the instruction because ALUs are fully pipelined + alu_busy[next_latency - op_latency[opcode]][alu_index] = true; + latency[a] = next_latency; + + // ASIC is supposed to have enough ALUs to run as many independent instructions per cycle as possible, so latency calculation for ASIC is simple + asic_latency[a] = 
((asic_latency[a] > asic_latency[b]) ? asic_latency[a] : asic_latency[b]) + asic_op_latency[opcode]; + + rotated[a] = is_rotation[opcode]; + + inst_data[a] = code_size + (opcode << 8) + ((inst_data[b] & 255) << 16); + + code[code_size].opcode = opcode; + code[code_size].dst_index = dst_index; + code[code_size].src_index = src_index; + code[code_size].C = 0; + + if (src_index == 8) + { + r8_used = true; + } + + if (opcode == ADD) + { + // ADD instruction is implemented as two 1-cycle instructions on a real CPU, so mark ALU as busy for the next cycle too + alu_busy[next_latency - op_latency[opcode] + 1][alu_index] = true; + + // ADD instruction requires 4 more random bytes for 32-bit constant "C" in "a = a + b + C" + check_data(&data_index, sizeof(uint32_t), data, sizeof(data)); + uint32_t t; + memcpy(&t, data + data_index, sizeof(uint32_t)); + code[code_size].C = SWAP32LE(t); + data_index += sizeof(uint32_t); + } + + ++code_size; + if (code_size >= NUM_INSTRUCTIONS_MIN) + { + break; + } + } + else + { + ++num_retries; + } + } + + // ASIC has more execution resources and can extract as much parallelism from the code as possible + // We need to add a few more MUL and ROR instructions to achieve minimal required latency for ASIC + // Get this latency for at least 1 of the 4 registers + const int prev_code_size = code_size; + while ((code_size < NUM_INSTRUCTIONS_MAX) && (asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY)) + { + int min_idx = 0; + int max_idx = 0; + for (int i = 1; i < 4; ++i) + { + if (asic_latency[i] < asic_latency[min_idx]) min_idx = i; + if (asic_latency[i] > asic_latency[max_idx]) max_idx = i; + } + + const uint8_t pattern[3] = { ROR, MUL, MUL }; + const uint8_t opcode = pattern[(code_size - prev_code_size) % 3]; + latency[min_idx] = latency[max_idx] + op_latency[opcode]; + asic_latency[min_idx] = asic_latency[max_idx] + asic_op_latency[opcode]; + + 
code[code_size].opcode = opcode; + code[code_size].dst_index = min_idx; + code[code_size].src_index = max_idx; + code[code_size].C = 0; + ++code_size; + } + + // There is ~98.15% chance that loop condition is false, so this loop will execute only 1 iteration most of the time + // It never does more than 4 iterations for all block heights < 10,000,000 + } while (!r8_used || (code_size < NUM_INSTRUCTIONS_MIN) || (code_size > NUM_INSTRUCTIONS_MAX)); + + // It's guaranteed that NUM_INSTRUCTIONS_MIN <= code_size <= NUM_INSTRUCTIONS_MAX here + // Add final instruction to stop the interpreter + code[code_size].opcode = RET; + code[code_size].dst_index = 0; + code[code_size].src_index = 0; + code[code_size].C = 0; + + return code_size; +} diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index 20203a3c5..064b07339 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -50,6 +50,7 @@ #include #include #include +#include #ifdef _WIN32 #include @@ -239,169 +240,208 @@ bool minethd::self_test() cn_hash_fun hashf; cn_hash_fun hashf_multi; - xmrstak_algo algo = xmrstak_algo::invalid_algo; + auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); - for(int algo_idx = 0; algo_idx < 2; ++algo_idx) + for(const auto algo : neededAlgorithms) { - if(algo_idx == 0) - algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo(); - else - algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot(); - - if(algo == cryptonight) + if(algo == POW(cryptonight)) { - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); - hashf("This is a test", 14, out, ctx); + std::cout<HaveHardwareAes(), false, algo); + hashf("This is a test", 14, out, ctx, algo); bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) 
== 0; - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight); - hashf("This is a test", 14, out, ctx); + minethd::cn_on_new_job dm; + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo); + hashf("This is a test", 14, out, ctx, algo); bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; - hashf_multi = func_multi_selector<2>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); - hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx); + func_multi_selector<2>(hashf_multi, dm, ::jconf::inst()->HaveHardwareAes(), false, algo); + hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx, algo); bResult = bResult && memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; - hashf_multi = func_multi_selector<2>(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight); - hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx); + func_multi_selector<2>(hashf_multi, dm, ::jconf::inst()->HaveHardwareAes(), true, algo); + hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx, algo); bResult = bResult && memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; - hashf_multi = 
func_multi_selector<3>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); - hashf_multi("This is a testThis is a testThis is a test", 14, out, ctx); + func_multi_selector<3>(hashf_multi, dm, ::jconf::inst()->HaveHardwareAes(), false, algo); + hashf_multi("This is a testThis is a testThis is a test", 14, out, ctx, algo); bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 96) == 0; - hashf_multi = func_multi_selector<4>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); - hashf_multi("This is a testThis is a testThis is a testThis is a test", 14, out, ctx); + func_multi_selector<4>(hashf_multi, dm, ::jconf::inst()->HaveHardwareAes(), false, algo); + hashf_multi("This is a testThis is a testThis is a testThis is a test", 14, out, ctx, algo); bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 128) == 0; - hashf_multi = func_multi_selector<5>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); - hashf_multi("This is a testThis is a testThis is a testThis is a testThis is a test", 14, out, ctx); + func_multi_selector<5>(hashf_multi, dm, 
::jconf::inst()->HaveHardwareAes(), false, algo); + hashf_multi("This is a testThis is a testThis is a testThis is a testThis is a test", 14, out, ctx, algo); bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 160) == 0; } - else if(algo == cryptonight_lite) + else if(algo == POW(cryptonight_lite)) { - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_lite); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0; - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_lite); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0; } - else if(algo == cryptonight_monero) + else if(algo == 
POW(cryptonight_monero)) { - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_monero); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0; - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_monero); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0; } - else if(algo == cryptonight_monero_v8) + else if(algo == POW(cryptonight_monero_v8)) { - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_monero_v8); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = memcmp(out, "\x35\x3f\xdc\x06\x8f\xd4\x7b\x03\xc0\x4b\x94\x31\xe0\x05\xe0\x0b\x68\xc2\x16\x8a\x3c\xc7\x33\x5c\x8b\x9b\x30\x81\x56\x59\x1a\x4f", 32) == 0; - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_monero_v8); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult &= memcmp(out, 
"\x35\x3f\xdc\x06\x8f\xd4\x7b\x03\xc0\x4b\x94\x31\xe0\x05\xe0\x0b\x68\xc2\x16\x8a\x3c\xc7\x33\x5c\x8b\x9b\x30\x81\x56\x59\x1a\x4f", 32) == 0; } - else if(algo == cryptonight_aeon) + else if(algo == POW(cryptonight_aeon)) { - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_aeon); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0; - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_aeon); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0; } - else if(algo == cryptonight_ipbc) + else if(algo == POW(cryptonight_ipbc)) { - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_ipbc); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0xb0", 32) == 0; - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_ipbc); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = 
func_selector(::jconf::inst()->HaveHardwareAes(), true, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0", 32) == 0; } - else if(algo == cryptonight_stellite) + else if(algo == POW(cryptonight_stellite)) { - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_stellite); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0; - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_stellite); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0; } - else if(algo == cryptonight_masari) + else if(algo == POW(cryptonight_masari)) { - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_masari); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0; - hashf = 
func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_masari); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0; } - else if(algo == cryptonight_heavy) + else if(algo == POW(cryptonight_heavy)) { - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_heavy); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0; - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_heavy); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0; } - else if(algo == cryptonight_haven) + else if(algo == POW(cryptonight_haven)) { - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_haven); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, 
"\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0; - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_haven); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0; } - else if(algo == cryptonight_bittube2) + else if(algo == POW(cryptonight_bittube2)) { unsigned char out[32 * MAX_N]; cn_hash_fun hashf; - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_bittube2); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo); - hashf("\x38\x27\x4c\x97\xc4\x5a\x17\x2c\xfc\x97\x67\x98\x70\x42\x2e\x3a\x1a\xb0\x78\x49\x60\xc6\x05\x14\xd8\x16\x27\x14\x15\xc3\x06\xee\x3a\x3e\xd1\xa7\x7e\x31\xf6\xa8\x85\xc3\xcb\xff\x01\x02\x03\x04", 48, out, ctx); + hashf("\x38\x27\x4c\x97\xc4\x5a\x17\x2c\xfc\x97\x67\x98\x70\x42\x2e\x3a\x1a\xb0\x78\x49\x60\xc6\x05\x14\xd8\x16\x27\x14\x15\xc3\x06\xee\x3a\x3e\xd1\xa7\x7e\x31\xf6\xa8\x85\xc3\xcb\xff\x01\x02\x03\x04", 48, out, ctx, algo); bResult = bResult && memcmp(out, "\x18\x2c\x30\x41\x93\x1a\x14\x73\xc6\xbf\x7e\x77\xfe\xb5\x17\x9b\xa8\xbe\xa9\x68\xba\x9e\xe1\xe8\x24\x1a\x12\x7a\xac\x81\xb4\x24", 32) == 0; - hashf("\x04\x04\xb4\x94\xce\xd9\x05\x18\xe7\x25\x5d\x01\x28\x63\xde\x8a\x4d\x27\x72\xb1\xff\x78\x8c\xd0\x56\x20\x38\x98\x3e\xd6\x8c\x94\xea\x00\xfe\x43\x66\x68\x83\x00\x00\x00\x00\x18\x7c\x2e\x0f\x66\xf5\x6b\xb9\xef\x67\xed\x35\x14\x5c\x69\xd4\x69\x0d\x1f\x98\x22\x44\x01\x2b\xea\x69\x6e\xe8\xb3\x3c\x42\x12\x01", 76, out, ctx); + 
hashf("\x04\x04\xb4\x94\xce\xd9\x05\x18\xe7\x25\x5d\x01\x28\x63\xde\x8a\x4d\x27\x72\xb1\xff\x78\x8c\xd0\x56\x20\x38\x98\x3e\xd6\x8c\x94\xea\x00\xfe\x43\x66\x68\x83\x00\x00\x00\x00\x18\x7c\x2e\x0f\x66\xf5\x6b\xb9\xef\x67\xed\x35\x14\x5c\x69\xd4\x69\x0d\x1f\x98\x22\x44\x01\x2b\xea\x69\x6e\xe8\xb3\x3c\x42\x12\x01", 76, out, ctx, algo); bResult = bResult && memcmp(out, "\x7f\xbe\xb9\x92\x76\x87\x5a\x3c\x43\xc2\xbe\x5a\x73\x36\x06\xb5\xdc\x79\xcc\x9c\xf3\x7c\x43\x3e\xb4\x18\x56\x17\xfb\x9b\xc9\x36", 32) == 0; - hashf("\x85\x19\xe0\x39\x17\x2b\x0d\x70\xe5\xca\x7b\x33\x83\xd6\xb3\x16\x73\x15\xa4\x22\x74\x7b\x73\xf0\x19\xcf\x95\x28\xf0\xfd\xe3\x41\xfd\x0f\x2a\x63\x03\x0b\xa6\x45\x05\x25\xcf\x6d\xe3\x18\x37\x66\x9a\xf6\xf1\xdf\x81\x31\xfa\xf5\x0a\xaa\xb8\xd3\xa7\x40\x55\x89", 64, out, ctx); + hashf("\x85\x19\xe0\x39\x17\x2b\x0d\x70\xe5\xca\x7b\x33\x83\xd6\xb3\x16\x73\x15\xa4\x22\x74\x7b\x73\xf0\x19\xcf\x95\x28\xf0\xfd\xe3\x41\xfd\x0f\x2a\x63\x03\x0b\xa6\x45\x05\x25\xcf\x6d\xe3\x18\x37\x66\x9a\xf6\xf1\xdf\x81\x31\xfa\xf5\x0a\xaa\xb8\xd3\xa7\x40\x55\x89", 64, out, ctx, algo); bResult = bResult && memcmp(out, "\x90\xdc\x65\x53\x8d\xb0\x00\xea\xa2\x52\xcd\xd4\x1c\x17\x7a\x64\xfe\xff\x95\x36\xe7\x71\x68\x35\xd4\xcf\x5c\x73\x56\xb1\x2f\xcd", 32) == 0; } - else if(algo == cryptonight_superfast) + else if(algo == POW(cryptonight_superfast)) { - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_superfast); - hashf("\x03\x05\xa0\xdb\xd6\xbf\x05\xcf\x16\xe5\x03\xf3\xa6\x6f\x78\x00\x7c\xbf\x34\x14\x43\x32\xec\xbf\xc2\x2e\xd9\x5c\x87\x00\x38\x3b\x30\x9a\xce\x19\x23\xa0\x96\x4b\x00\x00\x00\x08\xba\x93\x9a\x62\x72\x4c\x0d\x75\x81\xfc\xe5\x76\x1e\x9d\x8a\x0e\x6a\x1c\x3f\x92\x4f\xdd\x84\x93\xd1\x11\x56\x49\xc0\x5e\xb6\x01", 76, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo); + 
hashf("\x03\x05\xa0\xdb\xd6\xbf\x05\xcf\x16\xe5\x03\xf3\xa6\x6f\x78\x00\x7c\xbf\x34\x14\x43\x32\xec\xbf\xc2\x2e\xd9\x5c\x87\x00\x38\x3b\x30\x9a\xce\x19\x23\xa0\x96\x4b\x00\x00\x00\x08\xba\x93\x9a\x62\x72\x4c\x0d\x75\x81\xfc\xe5\x76\x1e\x9d\x8a\x0e\x6a\x1c\x3f\x92\x4f\xdd\x84\x93\xd1\x11\x56\x49\xc0\x5e\xb6\x01", 76, out, ctx, algo); bResult = bResult && memcmp(out, "\x40\x86\x5a\xa8\x87\x41\xec\x1d\xcc\xbd\x2b\xc6\xff\x36\xb9\x4d\x54\x71\x58\xdb\x94\x69\x8e\x3c\xa0\x3d\xe4\x81\x9a\x65\x9f\xef", 32) == 0; } + else if(algo == POW(cryptonight_gpu)) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo); + hashf("", 0, out, ctx, algo); + bResult = bResult && memcmp(out, "\x55\x5e\x0a\xee\x78\x79\x31\x6d\x7d\xef\xf7\x72\x97\x3c\xb9\x11\x8e\x38\x95\x70\x9d\xb2\x54\x7a\xc0\x72\xd5\xb9\x13\x10\x01\xd8", 32) == 0; + + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo); + hashf("", 0, out, ctx, algo); + bResult = bResult && memcmp(out, "\x55\x5e\x0a\xee\x78\x79\x31\x6d\x7d\xef\xf7\x72\x97\x3c\xb9\x11\x8e\x38\x95\x70\x9d\xb2\x54\x7a\xc0\x72\xd5\xb9\x13\x10\x01\xd8", 32) == 0; + } + else if(algo == POW(cryptonight_conceal)) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo); + hashf("", 0, out, ctx, algo); + bResult = bResult && memcmp(out, "\xb5\x54\x4b\x58\x16\x70\x26\x47\x63\x47\xe4\x1f\xb6\x5e\x57\xc9\x7c\xa5\x93\xfe\x0e\xb1\x0f\xb9\x2f\xa7\x3e\x5b\xae\xef\x79\x8c", 32) == 0; + + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo); + hashf("", 0, out, ctx, algo); + bResult = bResult && memcmp(out, "\xb5\x54\x4b\x58\x16\x70\x26\x47\x63\x47\xe4\x1f\xb6\x5e\x57\xc9\x7c\xa5\x93\xfe\x0e\xb1\x0f\xb9\x2f\xa7\x3e\x5b\xae\xef\x79\x8c", 32) == 0; + } + else if (algo == POW(cryptonight_turtle)) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); + bResult = bResult && memcmp(out, 
"\x30\x5f\x66\xfe\xbb\xf3\x60\x0e\xda\xbb\x60\xf7\xf1\xc9\xb9\x0a\x3a\xe8\x5a\x31\xd4\x76\xca\x38\x1d\x56\x18\xa6\xc6\x27\x60\xd7", 32) == 0; + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); + bResult = bResult && memcmp(out, "\x30\x5f\x66\xfe\xbb\xf3\x60\x0e\xda\xbb\x60\xf7\xf1\xc9\xb9\x0a\x3a\xe8\x5a\x31\xd4\x76\xca\x38\x1d\x56\x18\xa6\xc6\x27\x60\xd7", 32) == 0; + } + else if(algo == POW(cryptonight_r)) + { + minethd::cn_on_new_job set_job; + func_multi_selector<1>(hashf, set_job, ::jconf::inst()->HaveHardwareAes(), false, algo); + miner_work work; + work.iBlockHeight = 1806260; + set_job(work, ctx); + hashf("\x54\x68\x69\x73\x20\x69\x73\x20\x61\x20\x74\x65\x73\x74\x20\x54\x68\x69\x73\x20\x69\x73\x20\x61\x20\x74\x65\x73\x74\x20\x54\x68\x69\x73\x20\x69\x73\x20\x61\x20\x74\x65\x73\x74", 44, out, ctx, algo); + bResult = bResult && memcmp(out, "\xf7\x59\x58\x8a\xd5\x7e\x75\x84\x67\x29\x54\x43\xa9\xbd\x71\x49\x0a\xbf\xf8\xe9\xda\xd1\xb9\x5b\x6b\xf2\xf5\xd0\xd7\x83\x87\xbc", 32) == 0; + } + else + printer::inst()->print_msg(L0, + "Cryptonight hash self-test NOT defined for POW %s", algo.Name().c_str()); if(!bResult) printer::inst()->print_msg(L0, @@ -483,7 +523,8 @@ static std::string getAsmName(const uint32_t num_hashes) } template -minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str) +void minethd::func_multi_selector(minethd::cn_hash_fun& hash_fun, minethd::cn_on_new_job& on_new_job, + bool bHaveAes, bool bNoPrefetch, const xmrstak_algo& algo, const std::string& asm_version_str) { static_assert(N >= 1, "number of threads must be >= 1" ); @@ -492,7 +533,7 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc // function as a two digit binary uint8_t algv; - switch(algo) + switch(algo.Id()) { case cryptonight: algv = 2; @@ -530,6 +571,15 @@ 
minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc case cryptonight_superfast: algv = 11; break; + case cryptonight_gpu: + algv = 12; + break; + case cryptonight_conceal: + algv = 13; + break; + case cryptonight_r: + algv = 14; + break; default: algv = 2; break; @@ -590,22 +640,36 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc Cryptonight_hash::template hash, Cryptonight_hash::template hash, Cryptonight_hash::template hash, - + Cryptonight_hash::template hash, Cryptonight_hash::template hash, Cryptonight_hash::template hash, - Cryptonight_hash::template hash + Cryptonight_hash::template hash, + + Cryptonight_hash_gpu::template hash, + Cryptonight_hash_gpu::template hash, + Cryptonight_hash_gpu::template hash, + Cryptonight_hash_gpu::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash }; std::bitset<2> digit; digit.set(0, !bHaveAes); digit.set(1, !bNoPrefetch); - auto selected_function = func_table[ algv << 2 | digit.to_ulong() ]; - + hash_fun = func_table[ algv << 2 | digit.to_ulong() ]; // check for asm optimized version for cryptonight_v8 - if(N <= 2 && algo == cryptonight_monero_v8 && bHaveAes) + if(N <= 2 && algo == cryptonight_monero_v8 && bHaveAes && algo.Mem() == CN_MEMORY && algo.Iter() == CN_ITER) { std::string selected_asm = asm_version_str; if(selected_asm == "auto") @@ -617,15 +681,15 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc { // Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx) if(N == 1) - selected_function = Cryptonight_hash_asm<1u, 0u>::template hash; + hash_fun = Cryptonight_hash_asm<1u, 0u>::template hash; else if(N == 2) - selected_function = Cryptonight_hash_asm<2u, 
0u>::template hash; + hash_fun = Cryptonight_hash_asm<2u, 0u>::template hash; } // supports only 1 thread per hash if(N == 1 && selected_asm == "amd_avx") { // AMD Ryzen (1xxx and 2xxx series) - selected_function = Cryptonight_hash_asm<1u, 1u>::template hash; + hash_fun = Cryptonight_hash_asm<1u, 1u>::template hash; } if(asm_version_str == "auto" && (selected_asm != "intel_avx" || selected_asm != "amd_avx")) printer::inst()->print_msg(L3, "Switch to assembler version for '%s' cpu's", selected_asm.c_str()); @@ -633,13 +697,24 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc printer::inst()->print_msg(L1, "Assembler '%s' unknown, fallback to non asm version of cryptonight_v8", selected_asm.c_str()); } } - - return selected_function; + + static const std::unordered_map on_new_job_map = { + {cryptonight_r, Cryptonight_R_generator::template cn_on_new_job}, + }; + + auto it = on_new_job_map.find(algo.Id()); + if (it != on_new_job_map.end()) + on_new_job = it->second; + else + on_new_job = nullptr; } -minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo) +minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, const xmrstak_algo& algo) { - return func_multi_selector<1>(bHaveAes, bNoPrefetch, algo); + minethd::cn_hash_fun fun; + minethd::cn_on_new_job dm; + func_multi_selector<1>(fun, dm, bHaveAes, bNoPrefetch, algo); + return fun; } void minethd::work_main() @@ -719,10 +794,12 @@ void minethd::multiway_work_main() // start with root algorithm and switch later if fork version is reached auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot(); - cn_hash_fun hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str); + cn_hash_fun hash_fun_multi; + cn_on_new_job on_new_job; uint8_t version = 0; size_t lastPoolId = 0; + func_multi_selector(hash_fun_multi, on_new_job, 
::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str); while (bQuit == 0) { if (oWork.bStall) @@ -754,17 +831,20 @@ void minethd::multiway_work_main() if(new_version >= coinDesc.GetMiningForkVersion()) { miner_algo = coinDesc.GetMiningAlgo(); - hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str); + func_multi_selector(hash_fun_multi, on_new_job, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str); } else { miner_algo = coinDesc.GetMiningAlgoRoot(); - hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str); + func_multi_selector(hash_fun_multi, on_new_job, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str); } lastPoolId = oWork.iPoolId; version = new_version; } + if(on_new_job != nullptr) + on_new_job(oWork, ctx); + while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) { if ((iCount++ & 0x7) == 0) //Store stats every 8*N hashes @@ -787,7 +867,7 @@ void minethd::multiway_work_main() for (size_t i = 0; i < N; i++) *piNonce[i] = iNonce++; - hash_fun_multi(bWorkBlob, oWork.iWorkSize, bHashOut, ctx); + hash_fun_multi(bWorkBlob, oWork.iWorkSize, bHashOut, ctx, miner_algo); for (size_t i = 0; i < N; i++) { diff --git a/xmrstak/backend/cpu/minethd.hpp b/xmrstak/backend/cpu/minethd.hpp index eb77749f6..ca89e5b52 100644 --- a/xmrstak/backend/cpu/minethd.hpp +++ b/xmrstak/backend/cpu/minethd.hpp @@ -22,18 +22,20 @@ class minethd : public iBackend static std::vector thread_starter(uint32_t threadOffset, miner_work& pWork); static bool self_test(); - typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**); + typedef void (*cn_on_new_job)(const miner_work&, cryptonight_ctx**); + typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**, const xmrstak_algo&); - static cn_hash_fun func_selector(bool bHaveAes, bool 
bNoPrefetch, xmrstak_algo algo); + static cn_hash_fun func_selector(bool bHaveAes, bool bNoPrefetch, const xmrstak_algo& algo); static bool thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id); static cryptonight_ctx* minethd_alloc_ctx(); -private: - template - static cn_hash_fun func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str = "off"); + static void func_multi_selector(minethd::cn_hash_fun& hash_fun, minethd::cn_on_new_job& on_new_job, + bool bHaveAes, bool bNoPrefetch, const xmrstak_algo& algo, const std::string& asm_version_str = "off"); + private: + minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version); template diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp index e905caa9f..00311bb93 100644 --- a/xmrstak/backend/cryptonight.hpp +++ b/xmrstak/backend/cryptonight.hpp @@ -2,8 +2,12 @@ #include #include #include +#include +#include -enum xmrstak_algo +constexpr size_t start_derived_algo_id = 1000; + +enum xmrstak_algo_id { invalid_algo = 0, cryptonight = 1, @@ -17,213 +21,193 @@ enum xmrstak_algo cryptonight_haven = 9, // equal to cryptonight_heavy with a small tweak cryptonight_bittube2 = 10, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks cryptonight_monero_v8 = 11, - cryptonight_superfast = 12 + cryptonight_superfast = 12, + cryptonight_gpu = 13, + cryptonight_conceal = 14, + cryptonight_r_wow = 15, + cryptonight_r = 16, + + cryptonight_turtle = start_derived_algo_id, + cryptonight_v8_half = (start_derived_algo_id + 1), + cryptonight_v8_zelerius = (start_derived_algo_id + 2) + // please add the algorithm name to get_algo_name() }; -// define aeon settings -constexpr size_t CRYPTONIGHT_LITE_MEMORY = 1 * 1024 * 1024; -constexpr uint32_t CRYPTONIGHT_LITE_MASK = 0xFFFF0; -constexpr uint32_t CRYPTONIGHT_LITE_ITER = 0x40000; - -constexpr size_t 
CRYPTONIGHT_MEMORY = 2 * 1024 * 1024; -constexpr uint32_t CRYPTONIGHT_MASK = 0x1FFFF0; -constexpr uint32_t CRYPTONIGHT_ITER = 0x80000; - -constexpr size_t CRYPTONIGHT_HEAVY_MEMORY = 4 * 1024 * 1024; -constexpr uint32_t CRYPTONIGHT_HEAVY_MASK = 0x3FFFF0; -constexpr uint32_t CRYPTONIGHT_HEAVY_ITER = 0x40000; - -constexpr uint32_t CRYPTONIGHT_MASARI_ITER = 0x40000; - -constexpr uint32_t CRYPTONIGHT_SUPERFAST_ITER = 0x20000; - -template -inline constexpr size_t cn_select_memory() { return 0; } - -template<> -inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_MEMORY; } - -template<> -inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_LITE_MEMORY; } - -template<> -inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_MEMORY; } - -template<> -inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_MEMORY; } - -template<> -inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_HEAVY_MEMORY; } - -template<> -inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_LITE_MEMORY; } - -template<> -inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_LITE_MEMORY; } - -template<> -inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_MEMORY; } - -template<> -inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_MEMORY; } - -template<> -inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_HEAVY_MEMORY; } - -template<> -inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_HEAVY_MEMORY; } - -template<> -inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_MEMORY; } +/** get name of the algorithm + * + * @param algo mining algorithm + */ +inline std::string get_algo_name(xmrstak_algo_id algo_id) +{ + static std::array base_algo_names = + {{ + "invalid_algo", + "cryptonight", + "cryptonight_lite", + "cryptonight_v7", + "cryptonight_heavy", + "cryptonight_lite_v7", + "cryptonight_lite_v7_xor", + "cryptonight_v7_stellite", + "cryptonight_masari", + 
"cryptonight_haven", + "cryptonight_bittube2", + "cryptonight_v8", + "cryptonight_superfast", + "cryptonight_gpu", + "cryptonight_conceal", + "cryptonight_r_wow", + "cryptonight_r" + }}; + + static std::array derived_algo_names = + {{ + "cryptonight_turtle", + "cryptonight_v8_half", // used by masari and stellite + "cryptonight_v8_zelerius" + }}; + + + if(algo_id < start_derived_algo_id) + return base_algo_names[algo_id]; + else + return derived_algo_names[algo_id - start_derived_algo_id]; +} -inline size_t cn_select_memory(xmrstak_algo algo) +struct xmrstak_algo { - switch(algo) + xmrstak_algo(xmrstak_algo_id name_id) : algo_name(name_id), base_algo(name_id) + { + } + xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm) : algo_name(name_id), base_algo(algorithm) + { + } + xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration) : algo_name(name_id), base_algo(algorithm), iter(iteration) + { + } + xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration, size_t memory) : algo_name(name_id), base_algo(algorithm), iter(iteration), mem(memory) + { + } + xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration, size_t memory, uint32_t mem_mask) : algo_name(name_id), base_algo(algorithm), iter(iteration), mem(memory), mask(mem_mask) { - case cryptonight_stellite: - case cryptonight_monero: - case cryptonight_monero_v8: - case cryptonight_masari: - case cryptonight: - case cryptonight_superfast: - return CRYPTONIGHT_MEMORY; - case cryptonight_ipbc: - case cryptonight_aeon: - case cryptonight_lite: - return CRYPTONIGHT_LITE_MEMORY; - case cryptonight_bittube2: - case cryptonight_haven: - case cryptonight_heavy: - return CRYPTONIGHT_HEAVY_MEMORY; - default: - return 0; } -} - -template -inline constexpr uint32_t cn_select_mask() { return 0; } - -template<> -inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_MASK; } - -template<> -inline constexpr uint32_t 
cn_select_mask() { return CRYPTONIGHT_LITE_MASK; } - -template<> -inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_MASK; } - -template<> -inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_MASK; } - -template<> -inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_HEAVY_MASK; } - -template<> -inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_LITE_MASK; } - -template<> -inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_LITE_MASK; } - -template<> -inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_MASK; } - -template<> -inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_MASK; } - -template<> -inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_HEAVY_MASK; } - -template<> -inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_HEAVY_MASK; } - -template<> -inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_MASK; } -inline size_t cn_select_mask(xmrstak_algo algo) -{ - switch(algo) + /** check if the algorithm is equal to another algorithm + * + * we do not check the member algo_name because this is only an alias name + */ + bool operator==(const xmrstak_algo& other) const { - case cryptonight_stellite: - case cryptonight_monero: - case cryptonight_monero_v8: - case cryptonight_masari: - case cryptonight: - case cryptonight_superfast: - return CRYPTONIGHT_MASK; - case cryptonight_ipbc: - case cryptonight_aeon: - case cryptonight_lite: - return CRYPTONIGHT_LITE_MASK; - case cryptonight_bittube2: - case cryptonight_haven: - case cryptonight_heavy: - return CRYPTONIGHT_HEAVY_MASK; - default: - return 0; + return other.Id() == Id() && other.Mem() == Mem() && other.Iter() == Iter() && other.Mask() == Mask(); } -} -template -inline constexpr uint32_t cn_select_iter() { return 0; } + bool operator==(const xmrstak_algo_id& id) const + { + return base_algo == id; + } -template<> -inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; } + 
operator xmrstak_algo_id() const + { + return base_algo; + } -template<> -inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_LITE_ITER; } + xmrstak_algo_id Id() const + { + return base_algo; + } -template<> -inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; } + size_t Mem() const + { + if(base_algo == invalid_algo) + return 0; + else + return mem; + } -template<> -inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; } + uint32_t Iter() const + { + return iter; + } -template<> -inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_HEAVY_ITER; } + /** Name of the algorithm + * + * This name is only an alias for the native implemented base algorithm. + */ + std::string Name() const + { + return get_algo_name(algo_name); + } -template<> -inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_LITE_ITER; } + /** Name of the parent algorithm + * + * This is the real algorithm which is implemented in all POW functions. + */ + std::string BaseName() const + { + return get_algo_name(base_algo); + } -template<> -inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_LITE_ITER; } + uint32_t Mask() const + { + // default is a 16 byte aligne mask + if(mask == 0) + return ((mem - 1u) / 16) * 16; + else + return mask; + } -template<> -inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; } + xmrstak_algo_id algo_name = invalid_algo; + xmrstak_algo_id base_algo = invalid_algo; + uint32_t iter = 0u; + size_t mem = 0u; + uint32_t mask = 0u; +}; -template<> -inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_MASARI_ITER; } +// default cryptonight +constexpr size_t CN_MEMORY = 2 * 1024 * 1024; +constexpr uint32_t CN_ITER = 0x80000; -template<> -inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_HEAVY_ITER; } +// crptonight gpu +constexpr uint32_t CN_GPU_MASK = 0x1FFFC0; +constexpr uint32_t CN_GPU_ITER = 0xC000; -template<> -inline constexpr uint32_t cn_select_iter() { 
return CRYPTONIGHT_HEAVY_ITER; } +// cryptonight turtle (the mask is not using the full 256kib scratchpad) +constexpr uint32_t CN_TURTLE_MASK = 0x1FFF0; -template<> -inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_SUPERFAST_ITER; } +constexpr uint32_t CN_ZELERIUS_ITER = 0x60000; -inline size_t cn_select_iter(xmrstak_algo algo) +inline xmrstak_algo POW(xmrstak_algo_id algo_id) { - switch(algo) - { - case cryptonight_stellite: - case cryptonight_monero: - case cryptonight_monero_v8: - case cryptonight: - return CRYPTONIGHT_ITER; - case cryptonight_ipbc: - case cryptonight_aeon: - case cryptonight_lite: - return CRYPTONIGHT_LITE_ITER; - case cryptonight_bittube2: - case cryptonight_haven: - case cryptonight_heavy: - return CRYPTONIGHT_HEAVY_ITER; - case cryptonight_masari: - return CRYPTONIGHT_MASARI_ITER; - case cryptonight_superfast: - return CRYPTONIGHT_SUPERFAST_ITER; - default: - return 0; - } + static std::array pow = {{ + {invalid_algo, invalid_algo}, + {cryptonight, cryptonight, CN_ITER, CN_MEMORY}, + {cryptonight_lite, cryptonight_lite, CN_ITER/2, CN_MEMORY/2}, + {cryptonight_monero, cryptonight_monero, CN_ITER, CN_MEMORY}, + {cryptonight_heavy, cryptonight_heavy, CN_ITER/2, CN_MEMORY*2}, + {cryptonight_aeon, cryptonight_aeon, CN_ITER/2, CN_MEMORY/2}, + {cryptonight_ipbc, cryptonight_ipbc, CN_ITER/2, CN_MEMORY/2}, // equal to cryptonight_aeon with a small tweak in the miner code + {cryptonight_stellite, cryptonight_stellite, CN_ITER, CN_MEMORY}, //equal to cryptonight_monero but with one tiny change + {cryptonight_masari, cryptonight_masari, CN_ITER/2, CN_MEMORY}, //equal to cryptonight_monero but with less iterations, used by masari + {cryptonight_haven, cryptonight_haven, CN_ITER/2, CN_MEMORY*2}, // equal to cryptonight_heavy with a small tweak + {cryptonight_bittube2, cryptonight_bittube2, CN_ITER/2, CN_MEMORY*2}, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks + {cryptonight_monero_v8, 
cryptonight_monero_v8, CN_ITER, CN_MEMORY}, + {cryptonight_superfast, cryptonight_superfast, CN_ITER/4, CN_MEMORY}, + {cryptonight_gpu, cryptonight_gpu, CN_GPU_ITER, CN_MEMORY, CN_GPU_MASK}, + {cryptonight_conceal, cryptonight_conceal, CN_ITER/2, CN_MEMORY}, + {cryptonight_r_wow, cryptonight_r_wow, CN_ITER, CN_MEMORY}, + {cryptonight_r, cryptonight_r, CN_ITER, CN_MEMORY} + }}; + + static std::array derived_pow = + {{ + {cryptonight_turtle, cryptonight_monero_v8, CN_ITER/8, CN_MEMORY/8, CN_TURTLE_MASK}, + {cryptonight_v8_half, cryptonight_monero_v8, CN_ITER/2, CN_MEMORY}, + {cryptonight_v8_zelerius, cryptonight_monero_v8, CN_ZELERIUS_ITER, CN_MEMORY} + // {cryptonight_derived} + }}; + + if(algo_id < start_derived_algo_id) + return pow[algo_id]; + else + return derived_pow[algo_id - start_derived_algo_id]; } diff --git a/xmrstak/backend/globalStates.cpp b/xmrstak/backend/globalStates.cpp index 4eeed3c4b..52ef3f391 100644 --- a/xmrstak/backend/globalStates.cpp +++ b/xmrstak/backend/globalStates.cpp @@ -33,7 +33,7 @@ namespace xmrstak { -void globalStates::consume_work( miner_work& threadWork, uint64_t& currentJobId) +void globalStates::consume_work(miner_work& threadWork, uint64_t& currentJobId) { jobLock.ReadLock(); @@ -43,7 +43,7 @@ void globalStates::consume_work( miner_work& threadWork, uint64_t& currentJobId) jobLock.UnLock(); } -void globalStates::switch_work(miner_work& pWork, pool_data& dat) +void globalStates::switch_work(miner_work&& pWork, pool_data& dat) { jobLock.WriteLock(); @@ -61,7 +61,7 @@ void globalStates::switch_work(miner_work& pWork, pool_data& dat) * after the nonce is read. 
*/ dat.iSavedNonce = iGlobalNonce.exchange(dat.iSavedNonce, std::memory_order_relaxed); - oGlobalWork = pWork; + oGlobalWork = std::move(pWork); jobLock.UnLock(); } diff --git a/xmrstak/backend/globalStates.hpp b/xmrstak/backend/globalStates.hpp index c8d691712..d6966c4a2 100644 --- a/xmrstak/backend/globalStates.hpp +++ b/xmrstak/backend/globalStates.hpp @@ -22,7 +22,7 @@ struct globalStates } //pool_data is in-out winapi style - void switch_work(miner_work& pWork, pool_data& dat); + void switch_work(miner_work&& pWork, pool_data& dat); inline void calc_start_nonce(uint32_t& nonce, bool use_nicehash, uint32_t reserve_count) { diff --git a/xmrstak/backend/miner_work.hpp b/xmrstak/backend/miner_work.hpp index b6456f031..c8174df32 100644 --- a/xmrstak/backend/miner_work.hpp +++ b/xmrstak/backend/miner_work.hpp @@ -21,29 +21,40 @@ namespace xmrstak bool bNiceHash; bool bStall; size_t iPoolId; + uint64_t iBlockHeight; + uint8_t* ref_ptr; - miner_work() : iWorkSize(0), bNiceHash(false), bStall(true), iPoolId(invalid_pool_id) { } + miner_work() : iWorkSize(0), bNiceHash(false), bStall(true), iPoolId(invalid_pool_id), ref_ptr((uint8_t*)&iBlockHeight) { } miner_work(const char* sJobID, const uint8_t* bWork, uint32_t iWorkSize, - uint64_t iTarget, bool bNiceHash, size_t iPoolId) : iWorkSize(iWorkSize), - iTarget(iTarget), bNiceHash(bNiceHash), bStall(false), iPoolId(iPoolId) + uint64_t iTarget, bool bNiceHash, size_t iPoolId, uint64_t iBlockHeiht) : iWorkSize(iWorkSize), + iTarget(iTarget), bNiceHash(bNiceHash), bStall(false), iPoolId(iPoolId), iBlockHeight(iBlockHeiht), ref_ptr((uint8_t*)&iBlockHeight) { assert(iWorkSize <= sizeof(bWorkBlob)); - memcpy(this->sJobID, sJobID, sizeof(miner_work::sJobID)); memcpy(this->bWorkBlob, bWork, iWorkSize); + memcpy(this->sJobID, sJobID, sizeof(miner_work::sJobID)); + } + + miner_work(miner_work&& from) : iWorkSize(from.iWorkSize), iTarget(from.iTarget), + bStall(from.bStall), iPoolId(from.iPoolId), iBlockHeight(from.iBlockHeight), 
ref_ptr((uint8_t*)&iBlockHeight) + { + assert(iWorkSize <= sizeof(bWorkBlob)); + memcpy(bWorkBlob, from.bWorkBlob, iWorkSize); + memcpy(this->sJobID, from.sJobID, sizeof(miner_work::sJobID)); + } miner_work(miner_work const&) = delete; - miner_work& operator=(miner_work const& from) + miner_work& operator=(miner_work&& from) { assert(this != &from); + iBlockHeight = from.iBlockHeight; + iPoolId = from.iPoolId; + bStall = from.bStall; iWorkSize = from.iWorkSize; - iTarget = from.iTarget; bNiceHash = from.bNiceHash; - bStall = from.bStall; - iPoolId = from.iPoolId; + iTarget = from.iTarget; assert(iWorkSize <= sizeof(bWorkBlob)); memcpy(sJobID, from.sJobID, sizeof(sJobID)); @@ -52,23 +63,22 @@ namespace xmrstak return *this; } - miner_work(miner_work&& from) : iWorkSize(from.iWorkSize), iTarget(from.iTarget), - bStall(from.bStall), iPoolId(from.iPoolId) - { - assert(iWorkSize <= sizeof(bWorkBlob)); - memcpy(sJobID, from.sJobID, sizeof(sJobID)); - memcpy(bWorkBlob, from.bWorkBlob, iWorkSize); - } - - miner_work& operator=(miner_work&& from) + miner_work& operator=(miner_work const& from) { assert(this != &from); + iBlockHeight = from.iBlockHeight; + iPoolId = from.iPoolId; + bStall = from.bStall; iWorkSize = from.iWorkSize; - iTarget = from.iTarget; bNiceHash = from.bNiceHash; - bStall = from.bStall; - iPoolId = from.iPoolId; + iTarget = from.iTarget; + + if(!ref_ptr) + return *this; + + for(size_t i=0; i <= 7 && iPoolId; i++) + ref_ptr[i] = from.ref_ptr[7-i]; assert(iWorkSize <= sizeof(bWorkBlob)); memcpy(sJobID, from.sJobID, sizeof(sJobID)); diff --git a/xmrstak/backend/nvidia/CudaCryptonightR_gen.cpp b/xmrstak/backend/nvidia/CudaCryptonightR_gen.cpp new file mode 100644 index 000000000..87eb05540 --- /dev/null +++ b/xmrstak/backend/nvidia/CudaCryptonightR_gen.cpp @@ -0,0 +1,336 @@ +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either 
version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + */ + +#include +#include +#include +#include +#include +#include + +#include "xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp" +#include "xmrstak/backend/cpu/crypto/variant4_random_math.h" +#include "xmrstak/misc/console.hpp" +#include "xmrstak/cpputil/read_write_lock.h" + +namespace xmrstak +{ +namespace nvidia +{ + +static std::string get_code(const V4_Instruction* code, int code_size) +{ + std::stringstream s; + + for (int i = 0; i < code_size; ++i) + { + const V4_Instruction inst = code[i]; + + const uint32_t a = inst.dst_index; + const uint32_t b = inst.src_index; + + switch (inst.opcode) + { + case MUL: + s << 'r' << a << "*=r" << b << ';'; + break; + + case ADD: + s << 'r' << a << "+=r" << b << '+' << inst.C << "U;"; + break; + + case SUB: + s << 'r' << a << "-=r" << b << ';'; + break; + + case ROR: + s << 'r' << a << "=rotate_right(r" << a << ",r" << b << ");"; + break; + + case ROL: + s << 'r' << a << "=rotate_left(r" << a << ",r" << b << ");"; + break; + + case XOR: + s << 'r' << a << "^=r" << b << ';'; + break; + } + + s << '\n'; + } + + return s.str(); +} + +struct CacheEntry +{ + CacheEntry(xmrstak_algo algo, uint64_t height, int arch_major, int arch_minor, const std::vector& ptx, const std::string& lowered_name) : + algo(algo), + height(height), + arch_major(arch_major), + arch_minor(arch_minor), + ptx(ptx), + lowered_name(lowered_name) + {} + + xmrstak_algo algo; + uint64_t height; + int arch_major; + int arch_minor; + std::vector ptx; + std::string lowered_name; +}; + +struct BackgroundTaskBase +{ + virtual ~BackgroundTaskBase() 
{} + virtual void exec() = 0; +}; + +template +struct BackgroundTask : public BackgroundTaskBase +{ + BackgroundTask(T&& func) : m_func(std::move(func)) {} + void exec() override { m_func(); } + + T m_func; +}; + +static ::cpputil::RWLock CryptonightR_cache_mutex; +static std::mutex CryptonightR_build_mutex; +static std::vector CryptonightR_cache; + +static std::mutex background_tasks_mutex; +static std::vector background_tasks; +static std::thread* background_thread = nullptr; + +static void background_thread_proc() +{ + std::vector tasks; + for (;;) { + tasks.clear(); + { + std::lock_guard g(background_tasks_mutex); + background_tasks.swap(tasks); + } + + for (BackgroundTaskBase* task : tasks) { + task->exec(); + delete task; + } + + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + } +} + +template +static void background_exec(T&& func) +{ + BackgroundTaskBase* task = new BackgroundTask(std::move(func)); + + std::lock_guard g(background_tasks_mutex); + background_tasks.push_back(task); + if (!background_thread) { + background_thread = new std::thread(background_thread_proc); + } +} + +static void CryptonightR_build_program( + std::vector& ptx, + std::string& lowered_name, + const xmrstak_algo& algo, + uint64_t height, + int arch_major, + int arch_minor, + std::string source) +{ + { + CryptonightR_cache_mutex.WriteLock(); + + // Remove old programs from cache + for (size_t i = 0; i < CryptonightR_cache.size();) + { + const CacheEntry& entry = CryptonightR_cache[i]; + if ((entry.algo == algo) && (entry.height + 2 < height)) + { + printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu released (old program)", entry.height); + CryptonightR_cache[i] = std::move(CryptonightR_cache.back()); + CryptonightR_cache.pop_back(); + } + else + { + ++i; + } + } + CryptonightR_cache_mutex.UnLock(); + } + + ptx.clear(); + ptx.reserve(65536); + + std::lock_guard g1(CryptonightR_build_mutex); + { + CryptonightR_cache_mutex.ReadLock(); + + // Check if 
the cache already has this program (some other thread might have added it first) + for (const CacheEntry& entry : CryptonightR_cache) + { + if ((entry.algo == algo) && (entry.height == height) && (entry.arch_major == arch_major) && (entry.arch_minor == arch_minor)) + { + ptx = entry.ptx; + lowered_name = entry.lowered_name; + CryptonightR_cache_mutex.UnLock(); + return; + } + } + CryptonightR_cache_mutex.UnLock(); + } + + nvrtcProgram prog; + nvrtcResult result = nvrtcCreateProgram(&prog, source.c_str(), "CryptonightR.curt", 0, NULL, NULL); + if (result != NVRTC_SUCCESS) { + printer::inst()->print_msg(L0, "nvrtcCreateProgram failed: %s", nvrtcGetErrorString(result)); + return; + } + + result = nvrtcAddNameExpression(prog, "CryptonightR_phase2"); + if (result != NVRTC_SUCCESS) { + printer::inst()->print_msg(L0, "nvrtcAddNameExpression failed: %s", nvrtcGetErrorString(result)); + nvrtcDestroyProgram(&prog); + return; + } + + char opt0[64]; + sprintf(opt0, "--gpu-architecture=compute_%d%d", arch_major, arch_minor); + + char opt1[64]; + sprintf(opt1, "-DALGO=%d", static_cast(algo.Id())); + + const char* opts[2] = { opt0, opt1 }; + + result = nvrtcCompileProgram(prog, 2, opts); + if (result != NVRTC_SUCCESS) { + printer::inst()->print_msg(L0, "nvrtcCompileProgram failed: %s", nvrtcGetErrorString(result)); + + size_t logSize; + if (nvrtcGetProgramLogSize(prog, &logSize) == NVRTC_SUCCESS) { + char *log = new char[logSize]; + if (nvrtcGetProgramLog(prog, log) == NVRTC_SUCCESS) { + printer::inst()->print_msg(L0, "Program compile log: %s", log); + } + delete[]log; + } + nvrtcDestroyProgram(&prog); + return; + } + + + const char* name; + result = nvrtcGetLoweredName(prog, "CryptonightR_phase2", &name); + if (result != NVRTC_SUCCESS) { + printer::inst()->print_msg(L0, "nvrtcGetLoweredName failed: %s", nvrtcGetErrorString(result)); + nvrtcDestroyProgram(&prog); + return; + } + + size_t ptxSize; + result = nvrtcGetPTXSize(prog, &ptxSize); + if (result != NVRTC_SUCCESS) { + 
printer::inst()->print_msg(L0, "nvrtcGetPTXSize failed: %s", nvrtcGetErrorString(result)); + nvrtcDestroyProgram(&prog); + return; + } + + ptx.resize(ptxSize); + result = nvrtcGetPTX(prog, ptx.data()); + if (result != NVRTC_SUCCESS) { + printer::inst()->print_msg(L0, "nvrtcGetPTX failed: %s", nvrtcGetErrorString(result)); + nvrtcDestroyProgram(&prog); + return; + } + + lowered_name = name; + + nvrtcDestroyProgram(&prog); + + printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu compiled", height); + + CryptonightR_cache_mutex.WriteLock(); + CryptonightR_cache.emplace_back(algo, height, arch_major, arch_minor, ptx, lowered_name); + CryptonightR_cache_mutex.UnLock(); +} + +void CryptonightR_get_program(std::vector<char>& ptx, std::string& lowered_name, const xmrstak_algo algo, uint64_t height, int arch_major, int arch_minor, bool background) +{ + if (background) { + background_exec([=]() { std::vector<char> tmp; std::string s; CryptonightR_get_program(tmp, s, algo, height, arch_major, arch_minor, false); }); + return; + } + + ptx.clear(); + + const char* source_code_template = + #include "nvcc_code/cuda_cryptonight_r.curt" + ; + const char include_name[] = "XMRSTAK_INCLUDE_RANDOM_MATH"; + const char* offset = strstr(source_code_template, include_name); + if (!offset) + { + printer::inst()->print_msg(L0, "CryptonightR_get_program: XMRSTAK_INCLUDE_RANDOM_MATH not found in cuda_cryptonight_r.curt"); + return; + } + + V4_Instruction code[256]; + int code_size; + switch (algo.Id()) + { + case cryptonight_r_wow: + code_size = v4_random_math_init<cryptonight_r_wow>(code, height); + break; + case cryptonight_r: + code_size = v4_random_math_init<cryptonight_r>(code, height); + break; + default: + printer::inst()->print_msg(LDEBUG, "CryptonightR_get_program: invalid algo %d", algo); + return; + } + + std::string source_code(source_code_template, offset); + source_code.append(get_code(code, code_size)); + source_code.append(offset + sizeof(include_name) - 1); + + { + CryptonightR_cache_mutex.ReadLock(); + + // 
Check if the cache has this program + for (const CacheEntry& entry : CryptonightR_cache) + { + if ((entry.algo == algo) && (entry.height == height) && (entry.arch_major == arch_major) && (entry.arch_minor == arch_minor)) + { + printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu found in cache", height); + ptx = entry.ptx; + lowered_name = entry.lowered_name; + CryptonightR_cache_mutex.UnLock(); + return; + } + } + CryptonightR_cache_mutex.UnLock(); + } + + CryptonightR_build_program(ptx, lowered_name, algo, height, arch_major, arch_minor, source_code); +} + +} // namespace xmrstak +} //namespace nvidia diff --git a/xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp b/xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp new file mode 100644 index 000000000..e214647b9 --- /dev/null +++ b/xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp @@ -0,0 +1,37 @@ +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ * + */ + +#pragma once + +#include "xmrstak/backend/cryptonight.hpp" + +#include +#include +#include + + +namespace xmrstak +{ +namespace nvidia +{ + +void CryptonightR_get_program(std::vector& ptx, std::string& lowered_name, + const xmrstak_algo algo, uint64_t height, int arch_major, int arch_minor, bool background = false); + + +} // namespace xmrstak +} //namespace nvidia + diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp index 6460628de..794e68d11 100644 --- a/xmrstak/backend/nvidia/minethd.cpp +++ b/xmrstak/backend/nvidia/minethd.cpp @@ -173,6 +173,8 @@ std::vector* minethd::thread_starter(uint32_t threadOffset, miner_wor size_t i, n = jconf::inst()->GetGPUThreadCount(); pvThreads->reserve(n); + cuInit(0); + jconf::thd_cfg cfg; for (i = 0; i < n; i++) { @@ -226,7 +228,10 @@ void minethd::work_main() // start with root algorithm and switch later if fork version is reached auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot(); - cn_hash_fun hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo); + + cn_hash_fun hash_fun; + cpu::minethd::cn_on_new_job set_job; + cpu::minethd::func_multi_selector<1>(hash_fun, set_job, ::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo); uint32_t iNonce; @@ -255,17 +260,20 @@ void minethd::work_main() if(new_version >= coinDesc.GetMiningForkVersion()) { miner_algo = coinDesc.GetMiningAlgo(); - hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo); + cpu::minethd::func_multi_selector<1>(hash_fun, set_job, ::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo); } else { miner_algo = coinDesc.GetMiningAlgoRoot(); - hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo); + cpu::minethd::func_multi_selector<1>(hash_fun, set_job, 
::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo); } lastPoolId = oWork.iPoolId; version = new_version; } + if(set_job != nullptr) + set_job(oWork, &cpu_ctx); + cryptonight_extra_cpu_set_data(&ctx, oWork.bWorkBlob, oWork.iWorkSize); uint32_t h_per_round = ctx.device_blocks * ctx.device_threads; @@ -292,7 +300,7 @@ void minethd::work_main() cryptonight_extra_cpu_prepare(&ctx, iNonce, miner_algo); - cryptonight_core_cpu_hash(&ctx, miner_algo, iNonce); + cryptonight_core_cpu_hash(&ctx, miner_algo, iNonce, cpu_ctx->cn_r_ctx.height); cryptonight_extra_cpu_final(&ctx, iNonce, oWork.iTarget, &foundCount, foundNonce, miner_algo); @@ -307,7 +315,7 @@ void minethd::work_main() *(uint32_t*)(bWorkBlob + 39) = foundNonce[i]; - hash_fun(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx); + hash_fun(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx, miner_algo); if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget) executor::inst()->push_event(ex_event(job_result(oWork.sJobID, foundNonce[i], bResult, iThreadNo, miner_algo), oWork.iPoolId)); else diff --git a/xmrstak/backend/nvidia/minethd.hpp b/xmrstak/backend/nvidia/minethd.hpp index 389356842..3863c93e8 100644 --- a/xmrstak/backend/nvidia/minethd.hpp +++ b/xmrstak/backend/nvidia/minethd.hpp @@ -28,7 +28,7 @@ class minethd : public iBackend static bool self_test(); private: - typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**); + typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**, const xmrstak_algo&); minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg); void start_mining(); diff --git a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp index 8fda8d401..fe77b6f81 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp @@ -6,6 +6,8 @@ #include "xmrstak/jconf.hpp" #include "xmrstak/backend/cryptonight.hpp" +#include + typedef struct { int device_id; 
const char *device_name; @@ -33,6 +35,13 @@ typedef struct { std::string name; size_t free_device_memory; size_t total_device_memory; + + CUdevice cuDevice; + CUcontext cuContext; + CUmodule module = nullptr; + CUfunction kernel = nullptr; + uint64_t kernel_height = 0; + xmrstak_algo cached_algo = {xmrstak_algo_id::invalid_algo}; } nvid_ctx; extern "C" { @@ -46,8 +55,8 @@ int cuda_get_devicecount( int* deviceCount); int cuda_get_deviceinfo(nvid_ctx *ctx); int cryptonight_extra_cpu_init(nvid_ctx *ctx); void cryptonight_extra_cpu_set_data( nvid_ctx* ctx, const void *data, uint32_t len); -void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce, xmrstak_algo miner_algo); -void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce,xmrstak_algo miner_algo); +void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce, const xmrstak_algo& miner_algo); +void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce, const xmrstak_algo& miner_algo); } -void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce); +void cryptonight_core_cpu_hash(nvid_ctx* ctx, const xmrstak_algo& miner_algo, uint32_t startNonce, uint64_t chain_height); diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index 87c1befa8..d082f3362 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -10,6 +10,8 @@ #include "xmrstak/jconf.hpp" #include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp" #include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp" +#include "xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp" +#include "xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp" #ifdef _WIN32 @@ -127,8 +129,9 @@ __device__ __forceinline__ uint32_t rotate16( const uint32_t n ) return (n >> 
16u) | (n << 16u); } -template -__global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int partidx, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state2, uint32_t * __restrict__ ctx_key1 ) +__global__ void cryptonight_core_gpu_phase1( + const uint32_t ITERATIONS, const size_t MEMORY, + int threads, int bfactor, int partidx, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state2, uint32_t * __restrict__ ctx_key1 ) { __shared__ uint32_t sharedMemory[1024]; @@ -266,11 +269,13 @@ struct u64 : public uint2 * @tparam MEM_MODE if `0` than 64bit memory transfers per thread will be used to store/load data within shared memory * else if `1` 256bit operations will be used */ -template +template #ifdef XMR_STAK_THREADS __launch_bounds__( XMR_STAK_THREADS * 2 ) #endif -__global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state, +__global__ void cryptonight_core_gpu_phase2_double( + const uint32_t ITERATIONS, const size_t MEMORY, const uint32_t MASK, + int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state, uint32_t startNonce, uint32_t * __restrict__ d_input ) { __shared__ uint32_t sharedMemory[512]; @@ -312,11 +317,11 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in uint64_t division_result; if(ALGO == cryptonight_monero_v8) { - bx0 = ((uint64_t*)(d_ctx_b + thread * 12))[sub]; - bx1 = ((uint64_t*)(d_ctx_b + thread * 12 + 4))[sub]; + bx0 = ((uint64_t*)(d_ctx_b + thread * 16))[sub]; + bx1 = ((uint64_t*)(d_ctx_b + thread * 16 + 4))[sub]; - division_result = ((uint64_t*)(d_ctx_b + thread * 12 + 4 * 2))[0]; - sqrt_result = (d_ctx_b + thread * 12 + 4 * 2 + 2)[0]; + division_result = ((uint64_t*)(d_ctx_b + thread * 16 + 4 * 2))[0]; + sqrt_result = (d_ctx_b + thread * 16 + 4 * 2 + 2)[0]; } else bx0 = 
((uint64_t*)(d_ctx_b + thread * 4))[sub]; @@ -400,7 +405,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in ((uint32_t*)&cx_mul)[0] = shuffle<2>(sPtr, sub, cx_aes.x , 0); ((uint32_t*)&cx_mul)[1] = shuffle<2>(sPtr, sub, cx_aes.y , 0); - if(ALGO == cryptonight_monero_v8 && sub == 1) + if((ALGO == cryptonight_monero_v8) && sub == 1) { // Use division and square root results from the _previous_ iteration to hide the latency ((uint32_t*)&division_result)[1] ^= sqrt_result; @@ -466,14 +471,14 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in ((uint64_t*)(d_ctx_a + thread * 4))[sub] = ax0; if(ALGO == cryptonight_monero_v8) { - ((uint64_t*)(d_ctx_b + thread * 12))[sub] = bx0; - ((uint64_t*)(d_ctx_b + thread * 12 + 4))[sub] = bx1; + ((uint64_t*)(d_ctx_b + thread * 16))[sub] = bx0; + ((uint64_t*)(d_ctx_b + thread * 16 + 4))[sub] = bx1; if(sub == 1) { // must be valid only for `sub == 1` - ((uint64_t*)(d_ctx_b + thread * 12 + 4 * 2))[0] = division_result; - (d_ctx_b + thread * 12 + 4 * 2 + 2)[0] = sqrt_result; + ((uint64_t*)(d_ctx_b + thread * 16 + 4 * 2))[0] = division_result; + (d_ctx_b + thread * 16 + 4 * 2 + 2)[0] = sqrt_result; } } else @@ -481,11 +486,13 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in } } -template +template #ifdef XMR_STAK_THREADS __launch_bounds__( XMR_STAK_THREADS * 4 ) #endif -__global__ void cryptonight_core_gpu_phase2_quad( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state, +__global__ void cryptonight_core_gpu_phase2_quad( + const uint32_t ITERATIONS, const size_t MEMORY, const uint32_t MASK, + int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state, uint32_t startNonce, uint32_t * __restrict__ d_input ) { __shared__ uint32_t sharedMemory[1024]; @@ -517,6 +524,15 @@ __global__ void 
cryptonight_core_gpu_phase2_quad( int threads, int bfactor, int uint32_t a, d[2], idx0; uint32_t t1[2], t2[2], res; + float conc_var; + if(ALGO == cryptonight_conceal) + { + if(partidx != 0) + conc_var = int_as_float(*(d_ctx_b + threads * 4 + thread * 4 + sub)); + else + conc_var = 0.0f; + } + uint32_t tweak1_2[2]; if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) { @@ -579,7 +595,23 @@ __global__ void cryptonight_core_gpu_phase2_quad( int threads, int bfactor, int } else { - const uint32_t x_0 = loadGlobal32( long_state + j ); + uint32_t x_0 = loadGlobal32( long_state + j ); + + if(ALGO == cryptonight_conceal) + { + float r = int2float((int32_t)x_0); + float c_old = conc_var; + + r += conc_var; + r = r * r * r; + r = int_as_float((float_as_int(r) & 0x807FFFFF) | 0x40000000); + conc_var += r; + + c_old = int_as_float((float_as_int(c_old) & 0x807FFFFF) | 0x40000000); + c_old *= 536870880.0f; + x_0 = (uint32_t)(((int32_t)x_0) ^ ((int32_t)c_old)); + } + const uint32_t x_1 = shuffle<4>(sPtr,sub, x_0, sub + 1); const uint32_t x_2 = shuffle<4>(sPtr,sub, x_0, sub + 2); const uint32_t x_3 = shuffle<4>(sPtr,sub, x_0, sub + 3); @@ -681,11 +713,15 @@ __global__ void cryptonight_core_gpu_phase2_quad( int threads, int bfactor, int if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) if(sub&1) *(d_ctx_b + threads * 4 + thread) = idx0; + if(ALGO == cryptonight_conceal) + *(d_ctx_b + threads * 4 + thread * 4 + sub) = float_as_int(conc_var); } } -template -__global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int partidx, const uint32_t * __restrict__ long_state, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_key2 ) +template +__global__ void cryptonight_core_gpu_phase3( + const uint32_t ITERATIONS, const size_t MEMORY, + int threads, int 
bfactor, int partidx, const uint32_t * __restrict__ long_state, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_key2 ) { __shared__ uint32_t sharedMemory[1024]; @@ -724,7 +760,8 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti cn_aes_pseudo_round_mut( sharedMemory, text, key ); - if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || + ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) { #pragma unroll for ( int j = 0; j < 4; ++j ) @@ -735,9 +772,13 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti MEMCPY8( d_ctx_state + thread * 50 + sub + 16, text, 2 ); } -template -void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce) +template +void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo& algo) { + uint32_t MASK = algo.Mask(); + uint32_t ITERATIONS = algo.Iter(); + size_t MEM = algo.Mem()/4; + dim3 grid( ctx->device_blocks ); dim3 block( ctx->device_threads ); dim3 block2( ctx->device_threads << 1 ); @@ -759,7 +800,10 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce) for ( int i = 0; i < partcountOneThree; i++ ) { - CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase1<<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads, + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase1<<< grid, block8 >>>( + ITERATIONS, + MEM, + ctx->device_blocks*ctx->device_threads, bfactorOneThree, i, ctx->d_long_state, (ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast ? 
ctx->d_ctx_state2 : ctx->d_ctx_state), @@ -777,13 +821,16 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce) CUDA_CHECK_MSG_KERNEL( ctx->device_id, "\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**", - cryptonight_core_gpu_phase2_double<<< + cryptonight_core_gpu_phase2_double<<< grid, block2, - sizeof(uint64_t) * block2.x * 8 + + sizeof(uint64_t) * block.x * 8 + // shuffle memory for fermi gpus block2.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ) >>>( + ITERATIONS, + MEM, + MASK, ctx->device_blocks*ctx->device_threads, ctx->device_bfactor, i, @@ -796,16 +843,39 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce) ) ); } + else if(ALGO == cryptonight_r_wow || ALGO == cryptonight_r) + { + int numThreads = ctx->device_blocks*ctx->device_threads; + void* args[] = { + &ITERATIONS, &MEM, &MASK, + &numThreads, &ctx->device_bfactor, &i, + &ctx->d_long_state, &ctx->d_ctx_a, &ctx->d_ctx_b, &ctx->d_ctx_state, &nonce, &ctx->d_input + }; + CU_CHECK(ctx->device_id, cuLaunchKernel( + ctx->kernel, + grid.x, grid.y, grid.z, + block2.x, block2.y, block2.z, + sizeof(uint64_t) * block.x * 8 + + // shuffle memory for fermi gpus + block2.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ), + nullptr, + args, 0 + )); + CU_CHECK(ctx->device_id, cuCtxSynchronize()); + } else { CUDA_CHECK_MSG_KERNEL( ctx->device_id, "\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**", - cryptonight_core_gpu_phase2_quad<<< + cryptonight_core_gpu_phase2_quad<<< grid, block4, block4.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ) >>>( + ITERATIONS, + MEM, + MASK, ctx->device_blocks*ctx->device_threads, ctx->device_bfactor, i, @@ -832,64 +902,179 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce) for ( int i = 0; i < roundsPhase3; i++ ) { - CUDA_CHECK_KERNEL(ctx->device_id, 
cryptonight_core_gpu_phase3<<< + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<<< + grid, + block8, + block8.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ) + >>>( + ITERATIONS, + MEM, + ctx->device_blocks*ctx->device_threads, + bfactorOneThree, i, + ctx->d_long_state, + ctx->d_ctx_state, ctx->d_ctx_key2 )); + } +} + +template +void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo& algo) +{ + const uint32_t MASK = algo.Mask(); + const uint32_t ITERATIONS = algo.Iter(); + const size_t MEM = algo.Mem(); + + dim3 grid( ctx->device_blocks ); + dim3 block( ctx->device_threads ); + dim3 block2( ctx->device_threads << 1 ); + dim3 block4( ctx->device_threads << 2 ); + dim3 block8( ctx->device_threads << 3 ); + + size_t intensity = ctx->device_blocks * ctx->device_threads; + + CUDA_CHECK_KERNEL( + ctx->device_id, + xmrstak::nvidia::cn_explode_gpu<<>>(MEM, (int*)ctx->d_ctx_state, (int*)ctx->d_long_state) + ); + + int partcount = 1 << ctx->device_bfactor; + for(int i = 0; i < partcount; i++) + { + CUDA_CHECK_KERNEL( + ctx->device_id, + // 36 x 16byte x numThreads + xmrstak::nvidia::cryptonight_core_gpu_phase2_gpu + <<device_blocks, ctx->device_threads * 16, 32 * 16 * ctx->device_threads>>> + ( + ITERATIONS, + MEM, + MASK, + (int*)ctx->d_ctx_state, + (int*)ctx->d_long_state, + ctx->device_bfactor, + i, + ctx->d_ctx_a, + ctx->d_ctx_b + ) + ); + } + + /* bfactor for phase 3 + * + * 3 consume less time than phase 2, therefore we begin with the + * kernel splitting if the user defined a `bfactor >= 5` + */ + int bfactorOneThree = ctx->device_bfactor - 4; + if( bfactorOneThree < 0 ) + bfactorOneThree = 0; + + int partcountOneThree = 1 << bfactorOneThree; + int roundsPhase3 = partcountOneThree; + + if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || + ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast ) + { + // cryptonight_heavy used two full rounds over the scratchpad 
memory + roundsPhase3 *= 2; + } + + for ( int i = 0; i < roundsPhase3; i++ ) + { + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<<< grid, block8, block8.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ) - >>>( ctx->device_blocks*ctx->device_threads, + >>>( + ITERATIONS, + MEM/4, + ctx->device_blocks*ctx->device_threads, bfactorOneThree, i, ctx->d_long_state, ctx->d_ctx_state, ctx->d_ctx_key2 )); } } -void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce) +void cryptonight_core_cpu_hash(nvid_ctx* ctx, const xmrstak_algo& miner_algo, uint32_t startNonce, uint64_t chain_height) { - typedef void (*cuda_hash_fn)(nvid_ctx* ctx, uint32_t nonce); + + if((miner_algo == cryptonight_r_wow) || (miner_algo == cryptonight_r)) + { + if(ctx->kernel_height != chain_height || ctx->cached_algo != miner_algo) + { + if(ctx->module) + cuModuleUnload(ctx->module); + + std::vector ptx; + std::string lowered_name; + xmrstak::nvidia::CryptonightR_get_program(ptx, lowered_name, miner_algo, chain_height, ctx->device_arch[0], ctx->device_arch[1]); + + CU_CHECK(ctx->device_id, cuModuleLoadDataEx(&ctx->module, ptx.data(), 0, 0, 0)); + CU_CHECK(ctx->device_id, cuModuleGetFunction(&ctx->kernel, ctx->module, lowered_name.c_str())); + + ctx->kernel_height = chain_height; + ctx->cached_algo = miner_algo; + + xmrstak::nvidia::CryptonightR_get_program(ptx, lowered_name, miner_algo, chain_height + 1, ctx->device_arch[0], ctx->device_arch[1], true); + } + } + + typedef void (*cuda_hash_fn)(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo& algo); if(miner_algo == invalid_algo) return; static const cuda_hash_fn func_table[] = { - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, + 
cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash + cryptonight_core_gpu_hash_gpu, + cryptonight_core_gpu_hash_gpu, + + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash }; std::bitset<1> digit; digit.set(0, ctx->memMode == 1); cuda_hash_fn selected_function = func_table[ ((miner_algo - 1u) << 1) | digit.to_ulong() ]; - selected_function(ctx, startNonce); + selected_function(ctx, startNonce, miner_algo); + } diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp new file mode 100644 index 000000000..fee7e13d1 --- /dev/null +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp @@ -0,0 +1,564 @@ +#pragma once + +#include +#include +#include + +#include "cuda_keccak.hpp" +#include "cuda_extra.hpp" + +namespace xmrstak +{ 
+namespace nvidia +{ + +struct __m128i : public int4 +{ + + __forceinline__ __device__ __m128i(){} + + __forceinline__ __device__ __m128i( + const uint32_t x0, const uint32_t x1, + const uint32_t x2, const uint32_t x3) + { + x = x0; + y = x1; + z = x2; + w = x3; + } + + __forceinline__ __device__ __m128i( const int x0) + { + x = x0; + y = x0; + z = x0; + w = x0; + } + + __forceinline__ __device__ __m128i operator|(const __m128i& other) + { + return __m128i( + x | other.x, + y | other.y, + z | other.z, + w | other.w + ); + } + + __forceinline__ __device__ __m128i operator^(const __m128i& other) + { + return __m128i( + x ^ other.x, + y ^ other.y, + z ^ other.z, + w ^ other.w + ); + } +}; + +struct __m128 : public float4 +{ + + __forceinline__ __device__ __m128(){} + + __forceinline__ __device__ __m128( + const float x0, const float x1, + const float x2, const float x3) + { + float4::x = x0; + float4::y = x1; + float4::z = x2; + float4::w = x3; + } + + __forceinline__ __device__ __m128( const float x0) + { + float4::x = x0; + float4::y = x0; + float4::z = x0; + float4::w = x0; + } + + __forceinline__ __device__ __m128( const __m128i& x0) + { + float4::x = int2float(x0.x); + float4::y = int2float(x0.y); + float4::z = int2float(x0.z); + float4::w = int2float(x0.w); + } + + __forceinline__ __device__ __m128i get_int( ) + { + return __m128i( + (int)x, + (int)y, + (int)z, + (int)w + ); + } + + __forceinline__ __device__ __m128 operator+(const __m128& other) + { + return __m128( + x + other.x, + y + other.y, + z + other.z, + w + other.w + ); + } + + __forceinline__ __device__ __m128 operator-(const __m128& other) + { + return __m128( + x - other.x, + y - other.y, + z - other.z, + w - other.w + ); + } + + __forceinline__ __device__ __m128 operator*(const __m128& other) + { + return __m128( + x * other.x, + y * other.y, + z * other.z, + w * other.w + ); + } + + __forceinline__ __device__ __m128 operator/(const __m128& other) + { + return __m128( + x / other.x, + y / other.y, 
+ z / other.z, + w / other.w + ); + } + + __forceinline__ __device__ __m128& trunc() + { + x=::truncf(x); + y=::truncf(y); + z=::truncf(z); + w=::truncf(w); + + return *this; + } + + __forceinline__ __device__ __m128& abs() + { + x=::fabsf(x); + y=::fabsf(y); + z=::fabsf(z); + w=::fabsf(w); + + return *this; + } + + __forceinline__ __device__ __m128& floor() + { + x=::floorf(x); + y=::floorf(y); + z=::floorf(z); + w=::floorf(w); + + return *this; + } +}; + + +template +__device__ void print(const char* name, T value) +{ + printf("g %s: ", name); + for(int i = 0; i < 4; ++i) + { + printf("%08X ",((uint32_t*)&value)[i]); + } + printf("\n"); +} + +template<> +__device__ void print<__m128>(const char* name, __m128 value) +{ + printf("g %s: ", name); + for(int i = 0; i < 4; ++i) + { + printf("%f ",((float*)&value)[i]); + } + printf("\n"); +} + +#define SHOW(name) print(#name, name) + + +__forceinline__ __device__ __m128 _mm_add_ps(__m128 a, __m128 b) +{ + return a + b; +} + +__forceinline__ __device__ __m128 _mm_sub_ps(__m128 a, __m128 b) +{ + return a - b; +} + +__forceinline__ __device__ __m128 _mm_mul_ps(__m128 a, __m128 b) +{ + return a * b; +} + +__forceinline__ __device__ __m128 _mm_div_ps(__m128 a, __m128 b) +{ + return a / b; +} + +__forceinline__ __device__ __m128 _mm_and_ps(__m128 a, int b) +{ + return __m128( + int_as_float(float_as_int(a.x) & b), + int_as_float(float_as_int(a.y) & b), + int_as_float(float_as_int(a.z) & b), + int_as_float(float_as_int(a.w) & b) + ); +} + +__forceinline__ __device__ __m128 _mm_or_ps(__m128 a, int b) +{ + return __m128( + int_as_float(float_as_int(a.x) | b), + int_as_float(float_as_int(a.y) | b), + int_as_float(float_as_int(a.z) | b), + int_as_float(float_as_int(a.w) | b) + ); +} + +__forceinline__ __device__ __m128 _mm_xor_ps(__m128 a, int b) +{ + return __m128( + int_as_float(float_as_int(a.x) ^ b), + int_as_float(float_as_int(a.y) ^ b), + int_as_float(float_as_int(a.z) ^ b), + int_as_float(float_as_int(a.w) ^ b) + ); +} + 
+__forceinline__ __device__ __m128 _mm_fmod_ps(__m128 v, float dc) +{ + __m128 d(dc); + __m128 c = _mm_div_ps(v, d); + c.trunc();//_mm_round_ps(c, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); + // c = _mm_cvtepi32_ps(_mm_cvttps_epi32(c)); - sse2 + c = _mm_mul_ps(c, d); + return _mm_sub_ps(v, c); + + + //return a.fmodf(b); +} + +__forceinline__ __device__ __m128i _mm_xor_si128(__m128i a, __m128i b) +{ + return a ^ b; +} + + +__forceinline__ __device__ __m128i _mm_alignr_epi8(__m128i a, const uint32_t rot) +{ + const uint32_t right = 8 * rot; + const uint32_t left = (32 - 8 * rot); + return __m128i( + ((uint32_t)a.x >> right) | ( a.y << left ), + ((uint32_t)a.y >> right) | ( a.z << left ), + ((uint32_t)a.z >> right) | ( a.w << left ), + ((uint32_t)a.w >> right) | ( a.x << left ) + ); +} + +__device__ __m128i* scratchpad_ptr(uint32_t idx, uint32_t n, int *lpad, const uint32_t MASK) { return (__m128i*)((uint8_t*)lpad + (idx & MASK) + n * 16); } + + +__forceinline__ __device__ __m128 fma_break(__m128 x) +{ + // Break the dependency chain by setitng the exp to ?????01 + x = _mm_and_ps(x, 0xFEFFFFFF); + return _mm_or_ps(x, 0x00800000); +} + +// 9 +__forceinline__ __device__ void sub_round(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c, __m128& n, __m128& d, __m128& c) +{ + n1 = _mm_add_ps(n1, c); + __m128 nn = _mm_mul_ps(n0, c); + nn = _mm_mul_ps(n1, _mm_mul_ps(nn,nn)); + nn = fma_break(nn); + n = _mm_add_ps(n, nn); + + n3 = _mm_sub_ps(n3, c); + __m128 dd = _mm_mul_ps(n2, c); + dd = _mm_mul_ps(n3, _mm_mul_ps(dd,dd)); + dd = fma_break(dd); + d = _mm_add_ps(d, dd); + + //Constant feedback + c = _mm_add_ps(c, rnd_c); + c = _mm_add_ps(c, 0.734375f); + __m128 r = _mm_add_ps(nn, dd); + r = _mm_and_ps(r, 0x807FFFFF); + r = _mm_or_ps(r, 0x40000000); + c = _mm_add_ps(c, r); +} + +// 9*8 + 2 = 74 +__forceinline__ __device__ void round_compute(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c, __m128& c, __m128& r) +{ + __m128 n(0.0f), d(0.0f); + + sub_round(n0, n1, n2, 
n3, rnd_c, n, d, c); + sub_round(n1, n2, n3, n0, rnd_c, n, d, c); + sub_round(n2, n3, n0, n1, rnd_c, n, d, c); + sub_round(n3, n0, n1, n2, rnd_c, n, d, c); + sub_round(n3, n2, n1, n0, rnd_c, n, d, c); + sub_round(n2, n1, n0, n3, rnd_c, n, d, c); + sub_round(n1, n0, n3, n2, rnd_c, n, d, c); + sub_round(n0, n3, n2, n1, rnd_c, n, d, c); + + // Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0 + d = _mm_and_ps(d, 0xFF7FFFFF); + d = _mm_or_ps(d, 0x40000000); + r =_mm_add_ps(r, _mm_div_ps(n,d)); +} + +// 74*8 = 595 +__forceinline__ __device__ __m128i single_comupte(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum) +{ + __m128 c(cnt); + // 35 maths calls follow (140 FLOPS) + __m128 r = __m128(0.0f); + for(int i=0; i< 4; ++i) + round_compute(n0, n1, n2, n3, rnd_c, c, r); + // do a quick fmod by setting exp to 2 + r = _mm_and_ps(r, 0x807FFFFF); + r = _mm_or_ps(r, 0x40000000); + sum = r; // 34 + r = _mm_mul_ps(r, __m128(536870880.0f)); // 35 + return r.get_int(); + +} + +__forceinline__ __device__ void single_comupte_wrap(const uint32_t rot, const __m128i& v0, const __m128i& v1, const __m128i& v2, const __m128i& v3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out) +{ + __m128 n0(v0); + __m128 n1(v1); + __m128 n2(v2); + __m128 n3(v3); + + __m128i r = single_comupte(n0, n1, n2, n3, cnt, rnd_c, sum); + out = rot == 0 ? 
r : _mm_alignr_epi8(r, rot); +} + +__constant__ uint32_t look[16][4] = { + {0, 1, 2, 3}, + {0, 2, 3, 1}, + {0, 3, 1, 2}, + {0, 3, 2, 1}, + + {1, 0, 2, 3}, + {1, 2, 3, 0}, + {1, 3, 0, 2}, + {1, 3, 2, 0}, + + {2, 1, 0, 3}, + {2, 0, 3, 1}, + {2, 3, 1, 0}, + {2, 3, 0, 1}, + + {3, 1, 2, 0}, + {3, 2, 0, 1}, + {3, 0, 1, 2}, + {3, 0, 2, 1} +}; + +__constant__ float ccnt[16] = { + 1.34375f, + 1.28125f, + 1.359375f, + 1.3671875f, + + 1.4296875f, + 1.3984375f, + 1.3828125f, + 1.3046875f, + + 1.4140625f, + 1.2734375f, + 1.2578125f, + 1.2890625f, + + 1.3203125f, + 1.3515625f, + 1.3359375f, + 1.4609375f +}; + + +__forceinline__ __device__ void sync() +{ +#if (__CUDACC_VER_MAJOR__ >= 9) + __syncwarp(); +#else + __syncthreads( ); +#endif +} + +struct SharedMemChunk +{ + __m128i out[16]; + __m128 va[16]; +}; + +__global__ void cryptonight_core_gpu_phase2_gpu( + const uint32_t ITERATIONS, const size_t MEMORY, const uint32_t MASK, + int32_t *spad, int *lpad_in, int bfactor, int partidx, uint32_t * roundVs, uint32_t * roundS) +{ + + const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor ); + + extern __shared__ SharedMemChunk smemExtern_in[]; + + const uint32_t chunk = threadIdx.x / 16; + const uint32_t numHashPerBlock = blockDim.x / 16; + + int* lpad = (int*)((uint8_t*)lpad_in + size_t(MEMORY) * (blockIdx.x * numHashPerBlock + chunk)); + + SharedMemChunk* smem = smemExtern_in + chunk; + + uint32_t tid = threadIdx.x % 16; + + const uint32_t idxHash = blockIdx.x * numHashPerBlock + threadIdx.x/16; + uint32_t s = 0; + + __m128 vs(0); + if(partidx != 0) + { + vs = ((__m128*)roundVs)[idxHash]; + s = roundS[idxHash]; + } + else + { + s = ((uint32_t*)spad)[idxHash * 50] >> 8; + } + + // tid divided + const uint32_t tidd = tid / 4; + // tid modulo + const uint32_t tidm = tid % 4; + const uint32_t block = tidd * 16 + tidm; + + for(size_t i = 0; i < batchsize; i++) + { + sync(); + int tmp = ((int*)scratchpad_ptr(s, tidd, lpad, MASK))[tidm]; + ((int*)smem->out)[tid] = tmp; + sync(); + + __m128 
rc = vs; + single_comupte_wrap( + tidm, + *(smem->out + look[tid][0]), + *(smem->out + look[tid][1]), + *(smem->out + look[tid][2]), + *(smem->out + look[tid][3]), + ccnt[tid], rc, smem->va[tid], + smem->out[tid] + ); + + sync(); + + int outXor = ((int*)smem->out)[block]; + for(uint32_t dd = block + 4; dd < (tidd + 1) * 16; dd += 4) + outXor ^= ((int*)smem->out)[dd]; + + ((int*)scratchpad_ptr(s, tidd, lpad, MASK))[tidm] = outXor ^ tmp; + ((int*)smem->out)[tid] = outXor; + + float va_tmp1 = ((float*)smem->va)[block] + ((float*)smem->va)[block + 4]; + float va_tmp2 = ((float*)smem->va)[block+ 8] + ((float*)smem->va)[block + 12]; + ((float*)smem->va)[tid] = va_tmp1 + va_tmp2; + + sync(); + + __m128i out2 = smem->out[0] ^ smem->out[1] ^ smem->out[2] ^ smem->out[3]; + va_tmp1 = ((float*)smem->va)[block] + ((float*)smem->va)[block + 4]; + va_tmp2 = ((float*)smem->va)[block + 8] + ((float*)smem->va)[block + 12]; + ((float*)smem->va)[tid] = va_tmp1 + va_tmp2; + + sync(); + + vs = smem->va[0]; + vs.abs(); // take abs(va) by masking the float sign bit + auto xx = _mm_mul_ps(vs, __m128(16777216.0f)); + // vs range 0 - 64 + auto xx_int = xx.get_int(); + out2 = _mm_xor_si128(xx_int, out2); + // vs is now between 0 and 1 + vs = _mm_div_ps(vs, __m128(64.0f)); + s = out2.x ^ out2.y ^ out2.z ^ out2.w; + } + if(partidx != ((1<. 
+ * + */ + +#define cryptonight_r_wow 15 +#define cryptonight_r 16 + +typedef unsigned char uint8_t; +typedef unsigned int uint32_t; +typedef unsigned long long int uint64_t; + +static __constant__ uint32_t d_t_fn[1024] = +{ + 0xa56363c6U, 0x847c7cf8U, 0x997777eeU, 0x8d7b7bf6U, + 0x0df2f2ffU, 0xbd6b6bd6U, 0xb16f6fdeU, 0x54c5c591U, + 0x50303060U, 0x03010102U, 0xa96767ceU, 0x7d2b2b56U, + 0x19fefee7U, 0x62d7d7b5U, 0xe6abab4dU, 0x9a7676ecU, + 0x45caca8fU, 0x9d82821fU, 0x40c9c989U, 0x877d7dfaU, + 0x15fafaefU, 0xeb5959b2U, 0xc947478eU, 0x0bf0f0fbU, + 0xecadad41U, 0x67d4d4b3U, 0xfda2a25fU, 0xeaafaf45U, + 0xbf9c9c23U, 0xf7a4a453U, 0x967272e4U, 0x5bc0c09bU, + 0xc2b7b775U, 0x1cfdfde1U, 0xae93933dU, 0x6a26264cU, + 0x5a36366cU, 0x413f3f7eU, 0x02f7f7f5U, 0x4fcccc83U, + 0x5c343468U, 0xf4a5a551U, 0x34e5e5d1U, 0x08f1f1f9U, + 0x937171e2U, 0x73d8d8abU, 0x53313162U, 0x3f15152aU, + 0x0c040408U, 0x52c7c795U, 0x65232346U, 0x5ec3c39dU, + 0x28181830U, 0xa1969637U, 0x0f05050aU, 0xb59a9a2fU, + 0x0907070eU, 0x36121224U, 0x9b80801bU, 0x3de2e2dfU, + 0x26ebebcdU, 0x6927274eU, 0xcdb2b27fU, 0x9f7575eaU, + 0x1b090912U, 0x9e83831dU, 0x742c2c58U, 0x2e1a1a34U, + 0x2d1b1b36U, 0xb26e6edcU, 0xee5a5ab4U, 0xfba0a05bU, + 0xf65252a4U, 0x4d3b3b76U, 0x61d6d6b7U, 0xceb3b37dU, + 0x7b292952U, 0x3ee3e3ddU, 0x712f2f5eU, 0x97848413U, + 0xf55353a6U, 0x68d1d1b9U, 0x00000000U, 0x2cededc1U, + 0x60202040U, 0x1ffcfce3U, 0xc8b1b179U, 0xed5b5bb6U, + 0xbe6a6ad4U, 0x46cbcb8dU, 0xd9bebe67U, 0x4b393972U, + 0xde4a4a94U, 0xd44c4c98U, 0xe85858b0U, 0x4acfcf85U, + 0x6bd0d0bbU, 0x2aefefc5U, 0xe5aaaa4fU, 0x16fbfbedU, + 0xc5434386U, 0xd74d4d9aU, 0x55333366U, 0x94858511U, + 0xcf45458aU, 0x10f9f9e9U, 0x06020204U, 0x817f7ffeU, + 0xf05050a0U, 0x443c3c78U, 0xba9f9f25U, 0xe3a8a84bU, + 0xf35151a2U, 0xfea3a35dU, 0xc0404080U, 0x8a8f8f05U, + 0xad92923fU, 0xbc9d9d21U, 0x48383870U, 0x04f5f5f1U, + 0xdfbcbc63U, 0xc1b6b677U, 0x75dadaafU, 0x63212142U, + 0x30101020U, 0x1affffe5U, 0x0ef3f3fdU, 0x6dd2d2bfU, + 0x4ccdcd81U, 0x140c0c18U, 0x35131326U, 
0x2fececc3U, + 0xe15f5fbeU, 0xa2979735U, 0xcc444488U, 0x3917172eU, + 0x57c4c493U, 0xf2a7a755U, 0x827e7efcU, 0x473d3d7aU, + 0xac6464c8U, 0xe75d5dbaU, 0x2b191932U, 0x957373e6U, + 0xa06060c0U, 0x98818119U, 0xd14f4f9eU, 0x7fdcdca3U, + 0x66222244U, 0x7e2a2a54U, 0xab90903bU, 0x8388880bU, + 0xca46468cU, 0x29eeeec7U, 0xd3b8b86bU, 0x3c141428U, + 0x79dedea7U, 0xe25e5ebcU, 0x1d0b0b16U, 0x76dbdbadU, + 0x3be0e0dbU, 0x56323264U, 0x4e3a3a74U, 0x1e0a0a14U, + 0xdb494992U, 0x0a06060cU, 0x6c242448U, 0xe45c5cb8U, + 0x5dc2c29fU, 0x6ed3d3bdU, 0xefacac43U, 0xa66262c4U, + 0xa8919139U, 0xa4959531U, 0x37e4e4d3U, 0x8b7979f2U, + 0x32e7e7d5U, 0x43c8c88bU, 0x5937376eU, 0xb76d6ddaU, + 0x8c8d8d01U, 0x64d5d5b1U, 0xd24e4e9cU, 0xe0a9a949U, + 0xb46c6cd8U, 0xfa5656acU, 0x07f4f4f3U, 0x25eaeacfU, + 0xaf6565caU, 0x8e7a7af4U, 0xe9aeae47U, 0x18080810U, + 0xd5baba6fU, 0x887878f0U, 0x6f25254aU, 0x722e2e5cU, + 0x241c1c38U, 0xf1a6a657U, 0xc7b4b473U, 0x51c6c697U, + 0x23e8e8cbU, 0x7cdddda1U, 0x9c7474e8U, 0x211f1f3eU, + 0xdd4b4b96U, 0xdcbdbd61U, 0x868b8b0dU, 0x858a8a0fU, + 0x907070e0U, 0x423e3e7cU, 0xc4b5b571U, 0xaa6666ccU, + 0xd8484890U, 0x05030306U, 0x01f6f6f7U, 0x120e0e1cU, + 0xa36161c2U, 0x5f35356aU, 0xf95757aeU, 0xd0b9b969U, + 0x91868617U, 0x58c1c199U, 0x271d1d3aU, 0xb99e9e27U, + 0x38e1e1d9U, 0x13f8f8ebU, 0xb398982bU, 0x33111122U, + 0xbb6969d2U, 0x70d9d9a9U, 0x898e8e07U, 0xa7949433U, + 0xb69b9b2dU, 0x221e1e3cU, 0x92878715U, 0x20e9e9c9U, + 0x49cece87U, 0xff5555aaU, 0x78282850U, 0x7adfdfa5U, + 0x8f8c8c03U, 0xf8a1a159U, 0x80898909U, 0x170d0d1aU, + 0xdabfbf65U, 0x31e6e6d7U, 0xc6424284U, 0xb86868d0U, + 0xc3414182U, 0xb0999929U, 0x772d2d5aU, 0x110f0f1eU, + 0xcbb0b07bU, 0xfc5454a8U, 0xd6bbbb6dU, 0x3a16162cU, + 0x6363c6a5U, 0x7c7cf884U, 0x7777ee99U, 0x7b7bf68dU, + 0xf2f2ff0dU, 0x6b6bd6bdU, 0x6f6fdeb1U, 0xc5c59154U, + 0x30306050U, 0x01010203U, 0x6767cea9U, 0x2b2b567dU, + 0xfefee719U, 0xd7d7b562U, 0xabab4de6U, 0x7676ec9aU, + 0xcaca8f45U, 0x82821f9dU, 0xc9c98940U, 0x7d7dfa87U, + 0xfafaef15U, 0x5959b2ebU, 0x47478ec9U, 
0xf0f0fb0bU, + 0xadad41ecU, 0xd4d4b367U, 0xa2a25ffdU, 0xafaf45eaU, + 0x9c9c23bfU, 0xa4a453f7U, 0x7272e496U, 0xc0c09b5bU, + 0xb7b775c2U, 0xfdfde11cU, 0x93933daeU, 0x26264c6aU, + 0x36366c5aU, 0x3f3f7e41U, 0xf7f7f502U, 0xcccc834fU, + 0x3434685cU, 0xa5a551f4U, 0xe5e5d134U, 0xf1f1f908U, + 0x7171e293U, 0xd8d8ab73U, 0x31316253U, 0x15152a3fU, + 0x0404080cU, 0xc7c79552U, 0x23234665U, 0xc3c39d5eU, + 0x18183028U, 0x969637a1U, 0x05050a0fU, 0x9a9a2fb5U, + 0x07070e09U, 0x12122436U, 0x80801b9bU, 0xe2e2df3dU, + 0xebebcd26U, 0x27274e69U, 0xb2b27fcdU, 0x7575ea9fU, + 0x0909121bU, 0x83831d9eU, 0x2c2c5874U, 0x1a1a342eU, + 0x1b1b362dU, 0x6e6edcb2U, 0x5a5ab4eeU, 0xa0a05bfbU, + 0x5252a4f6U, 0x3b3b764dU, 0xd6d6b761U, 0xb3b37dceU, + 0x2929527bU, 0xe3e3dd3eU, 0x2f2f5e71U, 0x84841397U, + 0x5353a6f5U, 0xd1d1b968U, 0x00000000U, 0xededc12cU, + 0x20204060U, 0xfcfce31fU, 0xb1b179c8U, 0x5b5bb6edU, + 0x6a6ad4beU, 0xcbcb8d46U, 0xbebe67d9U, 0x3939724bU, + 0x4a4a94deU, 0x4c4c98d4U, 0x5858b0e8U, 0xcfcf854aU, + 0xd0d0bb6bU, 0xefefc52aU, 0xaaaa4fe5U, 0xfbfbed16U, + 0x434386c5U, 0x4d4d9ad7U, 0x33336655U, 0x85851194U, + 0x45458acfU, 0xf9f9e910U, 0x02020406U, 0x7f7ffe81U, + 0x5050a0f0U, 0x3c3c7844U, 0x9f9f25baU, 0xa8a84be3U, + 0x5151a2f3U, 0xa3a35dfeU, 0x404080c0U, 0x8f8f058aU, + 0x92923fadU, 0x9d9d21bcU, 0x38387048U, 0xf5f5f104U, + 0xbcbc63dfU, 0xb6b677c1U, 0xdadaaf75U, 0x21214263U, + 0x10102030U, 0xffffe51aU, 0xf3f3fd0eU, 0xd2d2bf6dU, + 0xcdcd814cU, 0x0c0c1814U, 0x13132635U, 0xececc32fU, + 0x5f5fbee1U, 0x979735a2U, 0x444488ccU, 0x17172e39U, + 0xc4c49357U, 0xa7a755f2U, 0x7e7efc82U, 0x3d3d7a47U, + 0x6464c8acU, 0x5d5dbae7U, 0x1919322bU, 0x7373e695U, + 0x6060c0a0U, 0x81811998U, 0x4f4f9ed1U, 0xdcdca37fU, + 0x22224466U, 0x2a2a547eU, 0x90903babU, 0x88880b83U, + 0x46468ccaU, 0xeeeec729U, 0xb8b86bd3U, 0x1414283cU, + 0xdedea779U, 0x5e5ebce2U, 0x0b0b161dU, 0xdbdbad76U, + 0xe0e0db3bU, 0x32326456U, 0x3a3a744eU, 0x0a0a141eU, + 0x494992dbU, 0x06060c0aU, 0x2424486cU, 0x5c5cb8e4U, + 0xc2c29f5dU, 0xd3d3bd6eU, 0xacac43efU, 
0x6262c4a6U, + 0x919139a8U, 0x959531a4U, 0xe4e4d337U, 0x7979f28bU, + 0xe7e7d532U, 0xc8c88b43U, 0x37376e59U, 0x6d6ddab7U, + 0x8d8d018cU, 0xd5d5b164U, 0x4e4e9cd2U, 0xa9a949e0U, + 0x6c6cd8b4U, 0x5656acfaU, 0xf4f4f307U, 0xeaeacf25U, + 0x6565caafU, 0x7a7af48eU, 0xaeae47e9U, 0x08081018U, + 0xbaba6fd5U, 0x7878f088U, 0x25254a6fU, 0x2e2e5c72U, + 0x1c1c3824U, 0xa6a657f1U, 0xb4b473c7U, 0xc6c69751U, + 0xe8e8cb23U, 0xdddda17cU, 0x7474e89cU, 0x1f1f3e21U, + 0x4b4b96ddU, 0xbdbd61dcU, 0x8b8b0d86U, 0x8a8a0f85U, + 0x7070e090U, 0x3e3e7c42U, 0xb5b571c4U, 0x6666ccaaU, + 0x484890d8U, 0x03030605U, 0xf6f6f701U, 0x0e0e1c12U, + 0x6161c2a3U, 0x35356a5fU, 0x5757aef9U, 0xb9b969d0U, + 0x86861791U, 0xc1c19958U, 0x1d1d3a27U, 0x9e9e27b9U, + 0xe1e1d938U, 0xf8f8eb13U, 0x98982bb3U, 0x11112233U, + 0x6969d2bbU, 0xd9d9a970U, 0x8e8e0789U, 0x949433a7U, + 0x9b9b2db6U, 0x1e1e3c22U, 0x87871592U, 0xe9e9c920U, + 0xcece8749U, 0x5555aaffU, 0x28285078U, 0xdfdfa57aU, + 0x8c8c038fU, 0xa1a159f8U, 0x89890980U, 0x0d0d1a17U, + 0xbfbf65daU, 0xe6e6d731U, 0x424284c6U, 0x6868d0b8U, + 0x414182c3U, 0x999929b0U, 0x2d2d5a77U, 0x0f0f1e11U, + 0xb0b07bcbU, 0x5454a8fcU, 0xbbbb6dd6U, 0x16162c3aU, + 0x63c6a563U, 0x7cf8847cU, 0x77ee9977U, 0x7bf68d7bU, + 0xf2ff0df2U, 0x6bd6bd6bU, 0x6fdeb16fU, 0xc59154c5U, + 0x30605030U, 0x01020301U, 0x67cea967U, 0x2b567d2bU, + 0xfee719feU, 0xd7b562d7U, 0xab4de6abU, 0x76ec9a76U, + 0xca8f45caU, 0x821f9d82U, 0xc98940c9U, 0x7dfa877dU, + 0xfaef15faU, 0x59b2eb59U, 0x478ec947U, 0xf0fb0bf0U, + 0xad41ecadU, 0xd4b367d4U, 0xa25ffda2U, 0xaf45eaafU, + 0x9c23bf9cU, 0xa453f7a4U, 0x72e49672U, 0xc09b5bc0U, + 0xb775c2b7U, 0xfde11cfdU, 0x933dae93U, 0x264c6a26U, + 0x366c5a36U, 0x3f7e413fU, 0xf7f502f7U, 0xcc834fccU, + 0x34685c34U, 0xa551f4a5U, 0xe5d134e5U, 0xf1f908f1U, + 0x71e29371U, 0xd8ab73d8U, 0x31625331U, 0x152a3f15U, + 0x04080c04U, 0xc79552c7U, 0x23466523U, 0xc39d5ec3U, + 0x18302818U, 0x9637a196U, 0x050a0f05U, 0x9a2fb59aU, + 0x070e0907U, 0x12243612U, 0x801b9b80U, 0xe2df3de2U, + 0xebcd26ebU, 0x274e6927U, 0xb27fcdb2U, 
0x75ea9f75U, + 0x09121b09U, 0x831d9e83U, 0x2c58742cU, 0x1a342e1aU, + 0x1b362d1bU, 0x6edcb26eU, 0x5ab4ee5aU, 0xa05bfba0U, + 0x52a4f652U, 0x3b764d3bU, 0xd6b761d6U, 0xb37dceb3U, + 0x29527b29U, 0xe3dd3ee3U, 0x2f5e712fU, 0x84139784U, + 0x53a6f553U, 0xd1b968d1U, 0x00000000U, 0xedc12cedU, + 0x20406020U, 0xfce31ffcU, 0xb179c8b1U, 0x5bb6ed5bU, + 0x6ad4be6aU, 0xcb8d46cbU, 0xbe67d9beU, 0x39724b39U, + 0x4a94de4aU, 0x4c98d44cU, 0x58b0e858U, 0xcf854acfU, + 0xd0bb6bd0U, 0xefc52aefU, 0xaa4fe5aaU, 0xfbed16fbU, + 0x4386c543U, 0x4d9ad74dU, 0x33665533U, 0x85119485U, + 0x458acf45U, 0xf9e910f9U, 0x02040602U, 0x7ffe817fU, + 0x50a0f050U, 0x3c78443cU, 0x9f25ba9fU, 0xa84be3a8U, + 0x51a2f351U, 0xa35dfea3U, 0x4080c040U, 0x8f058a8fU, + 0x923fad92U, 0x9d21bc9dU, 0x38704838U, 0xf5f104f5U, + 0xbc63dfbcU, 0xb677c1b6U, 0xdaaf75daU, 0x21426321U, + 0x10203010U, 0xffe51affU, 0xf3fd0ef3U, 0xd2bf6dd2U, + 0xcd814ccdU, 0x0c18140cU, 0x13263513U, 0xecc32fecU, + 0x5fbee15fU, 0x9735a297U, 0x4488cc44U, 0x172e3917U, + 0xc49357c4U, 0xa755f2a7U, 0x7efc827eU, 0x3d7a473dU, + 0x64c8ac64U, 0x5dbae75dU, 0x19322b19U, 0x73e69573U, + 0x60c0a060U, 0x81199881U, 0x4f9ed14fU, 0xdca37fdcU, + 0x22446622U, 0x2a547e2aU, 0x903bab90U, 0x880b8388U, + 0x468cca46U, 0xeec729eeU, 0xb86bd3b8U, 0x14283c14U, + 0xdea779deU, 0x5ebce25eU, 0x0b161d0bU, 0xdbad76dbU, + 0xe0db3be0U, 0x32645632U, 0x3a744e3aU, 0x0a141e0aU, + 0x4992db49U, 0x060c0a06U, 0x24486c24U, 0x5cb8e45cU, + 0xc29f5dc2U, 0xd3bd6ed3U, 0xac43efacU, 0x62c4a662U, + 0x9139a891U, 0x9531a495U, 0xe4d337e4U, 0x79f28b79U, + 0xe7d532e7U, 0xc88b43c8U, 0x376e5937U, 0x6ddab76dU, + 0x8d018c8dU, 0xd5b164d5U, 0x4e9cd24eU, 0xa949e0a9U, + 0x6cd8b46cU, 0x56acfa56U, 0xf4f307f4U, 0xeacf25eaU, + 0x65caaf65U, 0x7af48e7aU, 0xae47e9aeU, 0x08101808U, + 0xba6fd5baU, 0x78f08878U, 0x254a6f25U, 0x2e5c722eU, + 0x1c38241cU, 0xa657f1a6U, 0xb473c7b4U, 0xc69751c6U, + 0xe8cb23e8U, 0xdda17cddU, 0x74e89c74U, 0x1f3e211fU, + 0x4b96dd4bU, 0xbd61dcbdU, 0x8b0d868bU, 0x8a0f858aU, + 0x70e09070U, 0x3e7c423eU, 0xb571c4b5U, 
0x66ccaa66U, + 0x4890d848U, 0x03060503U, 0xf6f701f6U, 0x0e1c120eU, + 0x61c2a361U, 0x356a5f35U, 0x57aef957U, 0xb969d0b9U, + 0x86179186U, 0xc19958c1U, 0x1d3a271dU, 0x9e27b99eU, + 0xe1d938e1U, 0xf8eb13f8U, 0x982bb398U, 0x11223311U, + 0x69d2bb69U, 0xd9a970d9U, 0x8e07898eU, 0x9433a794U, + 0x9b2db69bU, 0x1e3c221eU, 0x87159287U, 0xe9c920e9U, + 0xce8749ceU, 0x55aaff55U, 0x28507828U, 0xdfa57adfU, + 0x8c038f8cU, 0xa159f8a1U, 0x89098089U, 0x0d1a170dU, + 0xbf65dabfU, 0xe6d731e6U, 0x4284c642U, 0x68d0b868U, + 0x4182c341U, 0x9929b099U, 0x2d5a772dU, 0x0f1e110fU, + 0xb07bcbb0U, 0x54a8fc54U, 0xbb6dd6bbU, 0x162c3a16U, + 0xc6a56363U, 0xf8847c7cU, 0xee997777U, 0xf68d7b7bU, + 0xff0df2f2U, 0xd6bd6b6bU, 0xdeb16f6fU, 0x9154c5c5U, + 0x60503030U, 0x02030101U, 0xcea96767U, 0x567d2b2bU, + 0xe719fefeU, 0xb562d7d7U, 0x4de6ababU, 0xec9a7676U, + 0x8f45cacaU, 0x1f9d8282U, 0x8940c9c9U, 0xfa877d7dU, + 0xef15fafaU, 0xb2eb5959U, 0x8ec94747U, 0xfb0bf0f0U, + 0x41ecadadU, 0xb367d4d4U, 0x5ffda2a2U, 0x45eaafafU, + 0x23bf9c9cU, 0x53f7a4a4U, 0xe4967272U, 0x9b5bc0c0U, + 0x75c2b7b7U, 0xe11cfdfdU, 0x3dae9393U, 0x4c6a2626U, + 0x6c5a3636U, 0x7e413f3fU, 0xf502f7f7U, 0x834fccccU, + 0x685c3434U, 0x51f4a5a5U, 0xd134e5e5U, 0xf908f1f1U, + 0xe2937171U, 0xab73d8d8U, 0x62533131U, 0x2a3f1515U, + 0x080c0404U, 0x9552c7c7U, 0x46652323U, 0x9d5ec3c3U, + 0x30281818U, 0x37a19696U, 0x0a0f0505U, 0x2fb59a9aU, + 0x0e090707U, 0x24361212U, 0x1b9b8080U, 0xdf3de2e2U, + 0xcd26ebebU, 0x4e692727U, 0x7fcdb2b2U, 0xea9f7575U, + 0x121b0909U, 0x1d9e8383U, 0x58742c2cU, 0x342e1a1aU, + 0x362d1b1bU, 0xdcb26e6eU, 0xb4ee5a5aU, 0x5bfba0a0U, + 0xa4f65252U, 0x764d3b3bU, 0xb761d6d6U, 0x7dceb3b3U, + 0x527b2929U, 0xdd3ee3e3U, 0x5e712f2fU, 0x13978484U, + 0xa6f55353U, 0xb968d1d1U, 0x00000000U, 0xc12cededU, + 0x40602020U, 0xe31ffcfcU, 0x79c8b1b1U, 0xb6ed5b5bU, + 0xd4be6a6aU, 0x8d46cbcbU, 0x67d9bebeU, 0x724b3939U, + 0x94de4a4aU, 0x98d44c4cU, 0xb0e85858U, 0x854acfcfU, + 0xbb6bd0d0U, 0xc52aefefU, 0x4fe5aaaaU, 0xed16fbfbU, + 0x86c54343U, 0x9ad74d4dU, 0x66553333U, 
0x11948585U, + 0x8acf4545U, 0xe910f9f9U, 0x04060202U, 0xfe817f7fU, + 0xa0f05050U, 0x78443c3cU, 0x25ba9f9fU, 0x4be3a8a8U, + 0xa2f35151U, 0x5dfea3a3U, 0x80c04040U, 0x058a8f8fU, + 0x3fad9292U, 0x21bc9d9dU, 0x70483838U, 0xf104f5f5U, + 0x63dfbcbcU, 0x77c1b6b6U, 0xaf75dadaU, 0x42632121U, + 0x20301010U, 0xe51affffU, 0xfd0ef3f3U, 0xbf6dd2d2U, + 0x814ccdcdU, 0x18140c0cU, 0x26351313U, 0xc32fececU, + 0xbee15f5fU, 0x35a29797U, 0x88cc4444U, 0x2e391717U, + 0x9357c4c4U, 0x55f2a7a7U, 0xfc827e7eU, 0x7a473d3dU, + 0xc8ac6464U, 0xbae75d5dU, 0x322b1919U, 0xe6957373U, + 0xc0a06060U, 0x19988181U, 0x9ed14f4fU, 0xa37fdcdcU, + 0x44662222U, 0x547e2a2aU, 0x3bab9090U, 0x0b838888U, + 0x8cca4646U, 0xc729eeeeU, 0x6bd3b8b8U, 0x283c1414U, + 0xa779dedeU, 0xbce25e5eU, 0x161d0b0bU, 0xad76dbdbU, + 0xdb3be0e0U, 0x64563232U, 0x744e3a3aU, 0x141e0a0aU, + 0x92db4949U, 0x0c0a0606U, 0x486c2424U, 0xb8e45c5cU, + 0x9f5dc2c2U, 0xbd6ed3d3U, 0x43efacacU, 0xc4a66262U, + 0x39a89191U, 0x31a49595U, 0xd337e4e4U, 0xf28b7979U, + 0xd532e7e7U, 0x8b43c8c8U, 0x6e593737U, 0xdab76d6dU, + 0x018c8d8dU, 0xb164d5d5U, 0x9cd24e4eU, 0x49e0a9a9U, + 0xd8b46c6cU, 0xacfa5656U, 0xf307f4f4U, 0xcf25eaeaU, + 0xcaaf6565U, 0xf48e7a7aU, 0x47e9aeaeU, 0x10180808U, + 0x6fd5babaU, 0xf0887878U, 0x4a6f2525U, 0x5c722e2eU, + 0x38241c1cU, 0x57f1a6a6U, 0x73c7b4b4U, 0x9751c6c6U, + 0xcb23e8e8U, 0xa17cddddU, 0xe89c7474U, 0x3e211f1fU, + 0x96dd4b4bU, 0x61dcbdbdU, 0x0d868b8bU, 0x0f858a8aU, + 0xe0907070U, 0x7c423e3eU, 0x71c4b5b5U, 0xccaa6666U, + 0x90d84848U, 0x06050303U, 0xf701f6f6U, 0x1c120e0eU, + 0xc2a36161U, 0x6a5f3535U, 0xaef95757U, 0x69d0b9b9U, + 0x17918686U, 0x9958c1c1U, 0x3a271d1dU, 0x27b99e9eU, + 0xd938e1e1U, 0xeb13f8f8U, 0x2bb39898U, 0x22331111U, + 0xd2bb6969U, 0xa970d9d9U, 0x07898e8eU, 0x33a79494U, + 0x2db69b9bU, 0x3c221e1eU, 0x15928787U, 0xc920e9e9U, + 0x8749ceceU, 0xaaff5555U, 0x50782828U, 0xa57adfdfU, + 0x038f8c8cU, 0x59f8a1a1U, 0x09808989U, 0x1a170d0dU, + 0x65dabfbfU, 0xd731e6e6U, 0x84c64242U, 0xd0b86868U, + 0x82c34141U, 0x29b09999U, 0x5a772d2dU, 
0x1e110f0fU, + 0x7bcbb0b0U, 0xa8fc5454U, 0x6dd6bbbbU, 0x2c3a1616U +}; + +#define t_fn0(x) (sharedMemory[ (x)]) +#define t_fn1(x) (sharedMemory[256 + (x)]) +#define t_fn2(x) (sharedMemory[512 + (x)]) +#define t_fn3(x) (sharedMemory[768 + (x)]) + +__device__ __forceinline__ static void cn_aes_gpu_init(uint32_t *sharedMemory) +{ + for(int i = threadIdx.x; i < 1024; i += blockDim.x) + sharedMemory[i] = d_t_fn[i]; +} + +)===" +R"===( + +template< typename T > +__forceinline__ __device__ void unusedVar( const T& ) +{ +} + +template +__forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr,const uint32_t sub,const int val,const uint32_t src) +{ +# if ( __CUDA_ARCH__ < 300 ) + ptr[sub] = val; + return ptr[src & (group_n-1)]; +# else + unusedVar( ptr ); + unusedVar( sub ); +# if (__CUDACC_VER_MAJOR__ >= 9) + return __shfl_sync(__activemask(), val, src, group_n); +# else + return __shfl( val, src, group_n ); +# endif +# endif +} + + +template +__forceinline__ __device__ uint64_t shuffle64(volatile uint32_t* ptr,const uint32_t sub,const uint64_t val,const uint32_t src, const uint32_t src2) +{ + uint64_t tmp; + ((uint32_t*)&tmp)[0] = shuffle(ptr, sub, static_cast(val), src); + ((uint32_t*)&tmp)[1] = shuffle(ptr, sub, static_cast(val >> 32), src2); + return tmp; +} + +struct u64 : public uint2 +{ + + __forceinline__ __device__ u64(){} + + __forceinline__ __device__ u64( const uint32_t x0, const uint32_t x1) + { + uint2::x = x0; + uint2::y = x1; + } + + __forceinline__ __device__ operator uint64_t() const + { + return *((uint64_t*)this); + } + + __forceinline__ __device__ u64( const uint64_t x0) + { + ((uint64_t*)&this->x)[0] = x0; + } + + __forceinline__ __device__ u64 operator^=(const u64& other) + { + uint2::x ^= other.x; + uint2::y ^= other.y; + + return *this; + } + + __forceinline__ __device__ u64 operator^=(const uint64_t& other) + { + uint2::x ^= static_cast(other); + uint2::y ^= static_cast(other >> 32); + + return *this; + } + + __forceinline__ __device__ u64 
operator+(const u64& other) const + { + u64 tmp; + ((uint64_t*)&tmp.x)[0] = ((uint64_t*)&(this->x))[0] + ((uint64_t*)&(other.x))[0]; + + return tmp; + } + + __forceinline__ __device__ u64 operator+=(const uint64_t& other) + { + return ((uint64_t*)&this->x)[0] += other; + } +}; + +#ifdef RANDOM_MATH_64_BIT + +__device__ __forceinline__ static uint64_t rotate_left(uint64_t a, uint64_t b) +{ + const int shift = b & 63; + return (a << shift) | (a >> (64 - shift)); +} + +__device__ __forceinline__ static uint64_t rotate_right(uint64_t a, uint64_t b) +{ + const int shift = b & 63; + return (a >> shift) | (a << (64 - shift)); +} + +#else + +__device__ __forceinline__ static uint32_t rotate_left(uint32_t a, uint32_t b) { +#if __CUDA_ARCH__ < 350 + const uint32_t shift = b & 31; + return (a << shift) | (a >> (32 - shift)); +#else + return __funnelshift_l(a, a, b); +#endif +} +__device__ __forceinline__ static uint32_t rotate_right(uint32_t a, uint32_t b) { +#if __CUDA_ARCH__ < 350 + const uint32_t shift = b & 31; + return (a >> shift) | (a << (32 - shift)); +#else + return __funnelshift_r(a, a, b); +#endif +} + +#endif + +__global__ void CryptonightR_phase2( + const uint32_t ITERATIONS, + const size_t MEMORY, + const uint32_t MASK, + int threads, + int bfactor, + int partidx, + uint32_t *d_long_state, + uint32_t *d_ctx_a, + uint32_t *d_ctx_b, + uint32_t * d_ctx_state, + uint32_t startNonce, + uint32_t * __restrict__ d_input + ) +{ + __shared__ uint32_t sharedMemory[1024]; + + cn_aes_gpu_init( sharedMemory ); + +# if( __CUDA_ARCH__ < 300 ) + extern __shared__ uint64_t externShared[]; + // 8 x 64bit values + volatile uint64_t* myChunks = (volatile uint64_t*)(externShared + (threadIdx.x >> 1) * 8); + volatile uint32_t* sPtr = (volatile uint32_t*)(externShared + (blockDim.x >> 1) * 8) + (threadIdx.x & 0xFFFFFFFE); +# else + extern __shared__ uint64_t chunkMem[]; + volatile uint32_t* sPtr = NULL; + // 8 x 64bit values + volatile uint64_t* myChunks = (volatile uint64_t*)(chunkMem 
+ (threadIdx.x >> 1) * 8); +# endif + + __syncthreads( ); + + const uint64_t tid = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = tid >> 1; + const uint32_t sub = tid & 1; + + if (thread >= threads) { + return; + } + + uint8_t *l0 = (uint8_t*)&d_long_state[((uint64_t)thread) * MEMORY]; + uint64_t ax0 = ((uint64_t*)(d_ctx_a + thread * 4))[sub]; + uint32_t idx0 = shuffle<2>(sPtr, sub, static_cast(ax0), 0); + uint64_t bx0 = ((uint64_t*)(d_ctx_b + thread * 16))[sub]; + uint64_t bx1 = ((uint64_t*)(d_ctx_b + thread * 16 + 4))[sub]; + + uint32_t r0 = d_ctx_b[thread * 16 + 4 * 2]; + uint32_t r1 = d_ctx_b[thread * 16 + 4 * 2 + 1]; + uint32_t r2 = d_ctx_b[thread * 16 + 4 * 2 + 2]; + uint32_t r3 = d_ctx_b[thread * 16 + 4 * 2 + 3]; + + const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor ); + const int start = partidx * batchsize; + const int end = start + batchsize; + + uint64_t* ptr0; + for (int i = start; i < end; ++i) { + ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0]; + + ((ulonglong4*)myChunks)[sub] = ((ulonglong4*)ptr0)[sub]; + + uint32_t idx1 = (idx0 & 0x30) >> 3; + const u64 cx = myChunks[ idx1 + sub ]; + const u64 cx2 = myChunks[ idx1 + ((sub + 1) & 1) ]; + + u64 cx_aes = ax0 ^ u64( + t_fn0( cx.x & 0xff ) ^ t_fn1( (cx.y >> 8) & 0xff ) ^ t_fn2( (cx2.x >> 16) & 0xff ) ^ t_fn3( (cx2.y >> 24 ) ), + t_fn0( cx.y & 0xff ) ^ t_fn1( (cx2.x >> 8) & 0xff ) ^ t_fn2( (cx2.y >> 16) & 0xff ) ^ t_fn3( (cx.x >> 24 ) ) + ); + + { + const uint64_t chunk1 = myChunks[idx1 ^ 2 + sub]; + const uint64_t chunk2 = myChunks[idx1 ^ 4 + sub]; + const uint64_t chunk3 = myChunks[idx1 ^ 6 + sub]; + +#if(ALGO == cryptonight_r) + cx_aes ^= chunk1 ^ chunk2 ^ chunk3; +#endif + +#if (__CUDACC_VER_MAJOR__ >= 9) + __syncwarp(); +#else + __syncthreads(); +#endif + + myChunks[idx1 ^ 2 + sub] = chunk3 + bx1; + myChunks[idx1 ^ 4 + sub] = chunk1 + bx0; + myChunks[idx1 ^ 6 + sub] = chunk2 + ax0; + } + + myChunks[idx1 + sub] = cx_aes ^ bx0; + + ((ulonglong4*)ptr0)[sub] = 
((ulonglong4*)myChunks)[sub]; + + idx0 = shuffle<2>(sPtr, sub, cx_aes.x, 0); + idx1 = (idx0 & 0x30) >> 3; + ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0]; + + ((ulonglong4*)myChunks)[sub] = ((ulonglong4*)ptr0)[sub]; + + uint64_t cx_mul; + ((uint32_t*)&cx_mul)[0] = shuffle<2>(sPtr, sub, cx_aes.x , 0); + ((uint32_t*)&cx_mul)[1] = shuffle<2>(sPtr, sub, cx_aes.y , 0); + + const uint32_t r4 = shuffle<2>(sPtr, sub, static_cast(ax0), 0); + const uint32_t r6 = shuffle<2>(sPtr, sub, static_cast(bx0), 0); + const uint32_t r7 = shuffle<2>(sPtr, sub, static_cast(bx1), 0); + + const uint64_t ax0_saved = ax0; + + if (sub == 1) + { + ((uint32_t*)&myChunks[idx1])[0] ^= r0 + r1; + ((uint32_t*)&myChunks[idx1])[1] ^= r2 + r3; + + const uint32_t r5 = static_cast(ax0); +#if(ALGO == cryptonight_r) + const uint32_t r8 = static_cast(bx1); +#endif + + XMRSTAK_INCLUDE_RANDOM_MATH + } + +#if(ALGO == cryptonight_r) + r0 = shuffle<2>(sPtr, sub, r0, 1); + r1 = shuffle<2>(sPtr, sub, r1, 1); + r2 = shuffle<2>(sPtr, sub, r2, 1); + r3 = shuffle<2>(sPtr, sub, r3, 1); + ax0 ^= (sub == 0) ? (r2 | ((uint64_t)(r3) << 32)) : (r0 | ((uint64_t)(r1) << 32)); +#endif + +#if (__CUDACC_VER_MAJOR__ >= 9) + __syncwarp(); +#else + __syncthreads( ); +#endif + + uint64_t c = ((uint64_t*)myChunks)[idx1 + sub]; + + { + uint64_t cl = ((uint64_t*)myChunks)[idx1]; + // sub 0 -> hi, sub 1 -> lo + uint64_t res = sub == 0 ? 
__umul64hi( cx_mul, cl ) : cx_mul * cl; + + const uint64_t chunk1 = myChunks[ idx1 ^ 2 + sub ] +#if(ALGO == cryptonight_r_wow) + ^ res +#endif + ; + uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ]; +#if(ALGO == cryptonight_r_wow) + res ^= ((uint64_t*)&chunk2)[0]; +#endif + const uint64_t chunk3 = myChunks[ idx1 ^ 6 + sub ]; + +#if(ALGO == cryptonight_r) + cx_aes ^= chunk1 ^ chunk2 ^ chunk3; +#endif + +# if (__CUDACC_VER_MAJOR__ >= 9) + __syncwarp(); +# else + __syncthreads( ); +# endif + + myChunks[idx1 ^ 2 + sub] = chunk3 + bx1; + myChunks[idx1 ^ 4 + sub] = chunk1 + bx0; + myChunks[idx1 ^ 6 + sub] = chunk2 + ax0_saved; + + ax0 += res; + } + + bx1 = bx0; + bx0 = cx_aes; + + myChunks[idx1 + sub] = ax0; + + ((ulonglong4*)ptr0)[sub] = ((ulonglong4*)myChunks)[sub]; + + ax0 ^= c; + idx0 = shuffle<2>(sPtr, sub, static_cast(ax0), 0); + } + + if (bfactor > 0) + { + ((uint64_t*)(d_ctx_a + thread * 4))[sub] = ax0; + ((uint64_t*)(d_ctx_b + thread * 16))[sub] = bx0; + ((uint64_t*)(d_ctx_b + thread * 16 + 4))[sub] = bx1; + + if (sub == 1) + { + // must be valid only for `sub == 1` + d_ctx_b[thread * 16 + 4 * 2] = r0; + d_ctx_b[thread * 16 + 4 * 2 + 1] = r1; + d_ctx_b[thread * 16 + 4 * 2 + 2] = r2; + d_ctx_b[thread * 16 + 4 * 2 + 3] = r3; + } + } +} +)===" diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp index 563bb3b9e..96cb679f5 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp @@ -22,6 +22,18 @@ } \ ( (void) 0 ) +#define CU_CHECK(id, ...) { \ + CUresult result = __VA_ARGS__; \ + if(result != CUDA_SUCCESS){ \ + const char* s; \ + cuGetErrorString(result, &s); \ + std::cerr << "[CUDA] Error gpu " << id << ": <" << __FUNCTION__ << ">:" << __LINE__ << " \"" << (s ? s : "unknown error") << "\"" << std::endl; \ + throw std::runtime_error(std::string("[CUDA] Error: ") + std::string(s ? 
s : "unknown error")); \ + } \ +} \ +( (void) 0 ) + + /** execute and check a CUDA api command * * @param id gpu id (thread id) diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu index 45afec9ac..7a9ccddc2 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu @@ -9,21 +9,6 @@ #include #include "xmrstak/jconf.hpp" -#ifdef __CUDACC__ -__constant__ -#else -const -#endif -uint64_t keccakf_rndc[24] ={ - 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, - 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, - 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, - 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, - 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, - 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, - 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, - 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 -}; typedef unsigned char BitSequence; typedef unsigned long long DataLength; @@ -108,7 +93,7 @@ __device__ __forceinline__ void mix_and_propagate( uint32_t* state ) (state + 4 * 7)[x] = (state + 4 * 7)[x] ^ tmp0[x]; } -template +template __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restrict__ d_input, uint32_t len, uint32_t startNonce, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_state2, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2 ) { int thread = ( blockDim.x * blockIdx.x + threadIdx.x ); @@ -144,14 +129,23 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric memcpy( d_ctx_a + thread * 4, ctx_a, 4 * 4 ); if(ALGO == cryptonight_monero_v8) { - memcpy( d_ctx_b + thread * 12, ctx_b, 4 * 4 ); + memcpy( d_ctx_b + thread * 16, ctx_b, 4 * 4 ); // bx1 XOR_BLOCKS_DST( ctx_state + 16, ctx_state + 20, ctx_b ); - 
memcpy( d_ctx_b + thread * 12 + 4, ctx_b, 4 * 4 ); + memcpy( d_ctx_b + thread * 16 + 4, ctx_b, 4 * 4 ); // division_result - memcpy( d_ctx_b + thread * 12 + 2 * 4, ctx_state + 24, 4 * 2 ); + memcpy( d_ctx_b + thread * 16 + 2 * 4, ctx_state + 24, 4 * 2 ); // sqrt_result - memcpy( d_ctx_b + thread * 12 + 2 * 4 + 2, ctx_state + 26, 4 * 2 ); + memcpy( d_ctx_b + thread * 16 + 2 * 4 + 2, ctx_state + 26, 4 * 2 ); + } + else if(ALGO == cryptonight_r_wow || ALGO == cryptonight_r) + { + memcpy(d_ctx_b + thread * 16, ctx_b, 4 * 4); + // bx1 + XOR_BLOCKS_DST(ctx_state + 16, ctx_state + 20, ctx_b); + memcpy(d_ctx_b + thread * 16 + 4, ctx_b, 4 * 4); + // r0, r1, r2, r3 + memcpy(d_ctx_b + thread * 16 + 2 * 4, ctx_state + 24, 4 * 8); } else memcpy( d_ctx_b + thread * 4, ctx_b, 4 * 4 ); @@ -177,14 +171,15 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric } } -template +template __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint32_t* __restrict__ d_res_count, uint32_t * __restrict__ d_res_nonce, uint32_t * __restrict__ d_ctx_state,uint32_t * __restrict__ d_ctx_key2 ) { const int thread = blockDim.x * blockIdx.x + threadIdx.x; __shared__ uint32_t sharedMemory[1024]; - if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || + ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) { cn_aes_gpu_init( sharedMemory ); __syncthreads( ); @@ -201,7 +196,8 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3 for ( i = 0; i < 50; i++ ) state[i] = ctx_state[i]; - if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || + ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) { 
uint32_t key[40]; @@ -220,33 +216,46 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3 } cn_keccakf2( (uint64_t *) state ); - switch ( ( (uint8_t *) state )[0] & 0x03 ) + if(ALGO == cryptonight_gpu) { - case 0: - cn_blake( (const uint8_t *) state, 200, (uint8_t *) hash ); - break; - case 1: - cn_groestl( (const BitSequence *) state, 200, (BitSequence *) hash ); - break; - case 2: - cn_jh( (const BitSequence *) state, 200, (BitSequence *) hash ); - break; - case 3: - cn_skein( (const BitSequence *) state, 200, (BitSequence *) hash ); - break; - default: - break; + if ( ((uint64_t*)state)[3] < target ) + { + uint32_t idx = atomicInc( d_res_count, 0xFFFFFFFF ); + + if(idx < 10) + d_res_nonce[idx] = thread; + } } + else + { + switch ( ( (uint8_t *) state )[0] & 0x03 ) + { + case 0: + cn_blake( (const uint8_t *) state, 200, (uint8_t *) hash ); + break; + case 1: + cn_groestl( (const BitSequence *) state, 200, (BitSequence *) hash ); + break; + case 2: + cn_jh( (const BitSequence *) state, 200, (BitSequence *) hash ); + break; + case 3: + cn_skein( (const BitSequence *) state, 200, (BitSequence *) hash ); + break; + default: + break; + } - // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values - // and expect an accurate result for target > 32-bit without implementing carries + // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values + // and expect an accurate result for target > 32-bit without implementing carries - if ( hash[3] < target ) - { - uint32_t idx = atomicInc( d_res_count, 0xFFFFFFFF ); + if ( hash[3] < target ) + { + uint32_t idx = atomicInc( d_res_count, 0xFFFFFFFF ); - if(idx < 10) - d_res_nonce[idx] = thread; + if(idx < 10) + d_res_nonce[idx] = thread; + } } } @@ -258,6 +267,9 @@ extern "C" void cryptonight_extra_cpu_set_data( nvid_ctx* ctx, const void *data, extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) { + CU_CHECK(ctx->device_id, 
cuDeviceGet(&ctx->cuDevice, ctx->device_id)); + CU_CHECK(ctx->device_id, cuCtxCreate(&ctx->cuContext, 0, ctx->cuDevice)); + cudaError_t err; err = cudaSetDevice(ctx->device_id); if(err != cudaSuccess) @@ -287,19 +299,22 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) // prefer shared memory over L1 cache CUDA_CHECK(ctx->device_id, cudaDeviceSetCacheConfig(cudaFuncCachePreferShared)); - size_t hashMemSize = std::max( - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()), - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) - ); + auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); + + size_t hashMemSize = 0; + for(const auto algo : neededAlgorithms) + { + hashMemSize = std::max(hashMemSize, algo.Mem()); + } size_t wsize = ctx->device_blocks * ctx->device_threads; CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state, 50 * sizeof(uint32_t) * wsize)); size_t ctx_b_size = 4 * sizeof(uint32_t) * wsize; if( - cryptonight_heavy == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() || - cryptonight_haven == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() || - cryptonight_bittube2 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() || - cryptonight_superfast == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end() || + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_haven) != neededAlgorithms.end() || + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_bittube2) != neededAlgorithms.end() || + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_superfast) != neededAlgorithms.end() ) { // extent ctx_b to hold the state of idx0 @@ -307,11 +322,22 @@ extern "C" int 
cryptonight_extra_cpu_init(nvid_ctx* ctx) // create a double buffer for the state to exchange the mixed state to phase1 CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state2, 50 * sizeof(uint32_t) * wsize)); } - else if(cryptonight_monero_v8 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() || - cryptonight_monero_v8 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) + else if(std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_conceal) != neededAlgorithms.end()) + { + ctx_b_size += sizeof(uint32_t) * 4 * wsize; + } + else if(std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end()) { - // bx1 (16byte), division_result (8byte) and sqrt_result (8byte) - ctx_b_size = 3 * 4 * sizeof(uint32_t) * wsize; + // bx0 (16byte), bx1 (16byte), division_result (8byte) and sqrt_result (8byte), padding (16byte) + ctx_b_size = 4 * 4 * sizeof(uint32_t) * wsize; + } + else if( + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_r) != neededAlgorithms.end() || + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_r_wow) != neededAlgorithms.end() + ) + { + // bx0 (16byte), bx1 (16byte), and [r0, r1, r2, r3] (a 8byte) + ctx_b_size = 4 * 4 * sizeof(uint32_t) * wsize; } else ctx->d_ctx_state2 = ctx->d_ctx_state; @@ -332,7 +358,7 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) return 1; } -extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce, xmrstak_algo miner_algo) +extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce, const xmrstak_algo& miner_algo) { int threadsperblock = 128; uint32_t wsize = ctx->device_blocks * ctx->device_threads; @@ -360,11 +386,26 @@ extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, 
startNonce, ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); } - if(miner_algo == cryptonight_monero_v8) + else if(miner_algo == cryptonight_monero_v8) { CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); } + else if(miner_algo == cryptonight_gpu) + { + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + } + else if(miner_algo == cryptonight_r) + { + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + } + else if(miner_algo == cryptonight_r_wow) + { + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + } else { /* pass two times d_ctx_state because the second state is used later in phase1, @@ -375,7 +416,7 @@ extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce } } -extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce,xmrstak_algo miner_algo) +extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce, const xmrstak_algo& miner_algo) { int threadsperblock = 128; uint32_t wsize = ctx->device_blocks * ctx->device_threads; @@ -409,7 +450,7 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, "\n**suggestion: Try to increase the value of 
the attribute 'bfactor' in the NVIDIA config file.**", cryptonight_extra_gpu_final<<>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 ) ); - } + } else if(miner_algo == cryptonight_bittube2) { CUDA_CHECK_MSG_KERNEL( @@ -418,6 +459,15 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, cryptonight_extra_gpu_final<<>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 ) ); } + else if(miner_algo == cryptonight_gpu) + { + // fallback for all other algorithms + CUDA_CHECK_MSG_KERNEL( + ctx->device_id, + "\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**", + cryptonight_extra_gpu_final<<>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 ) + ); + } else { // fallback for all other algorithms @@ -571,6 +621,10 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) } } + auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); + bool useCryptonight_gpu = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_gpu) != neededAlgorithms.end(); + + // set all device option those marked as auto (-1) to a valid value if(ctx->device_blocks == -1) { @@ -578,8 +632,11 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) * - 3 * SMX count for >=sm_30 * - 2 * SMX count for device_blocks = props.multiProcessorCount * - ( props.major < 3 ? 2 : 3 ); + ctx->device_blocks = props.multiProcessorCount * (props.major < 3 ? 2 : 3); + + // use 6 blocks per SM for sm_2X else 8 blocks + if(useCryptonight_gpu) + ctx->device_blocks = props.multiProcessorCount * (props.major < 3 ? 
6 : 8); // increase bfactor for low end devices to avoid that the miner is killed by the OS if(props.multiProcessorCount <= 6) @@ -591,7 +648,16 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) * `cryptonight_core_gpu_phase1` and `cryptonight_core_gpu_phase3` starts * `8 * ctx->device_threads` threads per block */ - ctx->device_threads = 64; + const uint32_t maxThreadsPerBlock = props.major < 3 ? 512 : 1024; + + // for the most algorithms we are using 8 threads per hash + uint32_t threadsPerHash = 8; + + // phase2_gpu uses 16 threads per hash + if(useCryptonight_gpu) + threadsPerHash = 16; + + ctx->device_threads = maxThreadsPerBlock / threadsPerHash; constexpr size_t byteToMiB = 1024u * 1024u; // no limit by default 1TiB @@ -656,10 +722,11 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) ctx->total_device_memory = totalMemory; ctx->free_device_memory = freeMemory; - size_t hashMemSize = std::max( - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()), - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) - ); + size_t hashMemSize = 0; + for(const auto algo : neededAlgorithms) + { + hashMemSize = std::max(hashMemSize, algo.Mem()); + } #ifdef WIN32 /* We use in windows bfactor (split slow kernel into smaller parts) to avoid @@ -688,10 +755,10 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) // 680bytes are extra meta data memory per hash size_t perThread = hashMemSize + 16192u + 680u; if( - cryptonight_heavy == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() || - cryptonight_haven == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() || - cryptonight_bittube2 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() || - cryptonight_superfast == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), 
cryptonight_heavy) != neededAlgorithms.end() || + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_haven) != neededAlgorithms.end() || + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_bittube2) != neededAlgorithms.end() || + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_superfast) != neededAlgorithms.end() ) perThread += 50 * 4; // state double buffer @@ -700,19 +767,18 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) // use only odd number of threads ctx->device_threads = ctx->device_threads & 0xFFFFFFFE; - if(props.major == 2 && ctx->device_threads > 64) + if(ctx->device_threads > maxThreadsPerBlock / threadsPerHash) { - // Fermi gpus only support 512 threads per block (we need start 4 * configured threads) - ctx->device_threads = 64; + ctx->device_threads = maxThreadsPerBlock / threadsPerHash; } // check if cryptonight_monero_v8 is selected for the user pool - bool useCryptonight_v8 = - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 || - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8; + bool useCryptonight_v8 = (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end()); + bool useCryptonight_r = (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_r) != neededAlgorithms.end()); + bool useCryptonight_r_wow = (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_r_wow) != neededAlgorithms.end()); // overwrite default config if cryptonight_monero_v8 is mined and GPU has at least compute capability 5.0 - if(useCryptonight_v8 && gpuArch >= 50) + if((useCryptonight_v8 || useCryptonight_r || useCryptonight_r_wow) && gpuArch >= 50) { // 4 based on my test maybe it must be adjusted later size_t threads = 4; @@ -725,6 +791,28 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) 
ctx->device_blocks = blockOptimal; } } + else if(useCryptonight_gpu) + { + // 8 based on my profiling sessions maybe it must be adjusted later + size_t threads = 8; + // 8 is chosen by checking the occupancy calculator + size_t blockOptimal = 8 * ctx->device_mpcount; + + // the following values are calculated with CUDA10 and the occupancy calculator + if(gpuArch == 35 || gpuArch/10 == 5 || gpuArch/10 == 6) + blockOptimal = 7 * ctx->device_mpcount; + if(gpuArch == 37) + blockOptimal = 14 * ctx->device_mpcount; + if(gpuArch >= 70) + blockOptimal = 6 * ctx->device_mpcount; + + if(blockOptimal * threads * hashMemSize < limitedMemory) + { + ctx->device_threads = threads; + ctx->device_blocks = blockOptimal; + } + + } } printf("device init succeeded\n"); diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp index 99c651645..c75c74964 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp @@ -1,3 +1,23 @@ +#pragma once + +#include "cuda_extra.hpp" + +#ifdef __CUDACC__ +__constant__ +#else +const +#endif +uint64_t keccakf_rndc[24] ={ + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 +}; + #if __CUDA_ARCH__ >= 350 __forceinline__ __device__ uint64_t cuda_rotl64(const uint64_t value, const int offset) { diff --git a/xmrstak/cli/cli-miner.cpp b/xmrstak/cli/cli-miner.cpp index c57416f2e..5a8a51703 100644 --- a/xmrstak/cli/cli-miner.cpp +++ b/xmrstak/cli/cli-miner.cpp @@ -811,7 +811,7 @@ int main(int argc, char *argv[]) 
printer::inst()->print_str("This currency is a way for us to implement the ideas that we were unable to in\n"); printer::inst()->print_str("Monero. See https://github.com/fireice-uk/cryptonote-speedup-demo for details.\n"); printer::inst()->print_str("-------------------------------------------------------------------\n"); - printer::inst()->print_msg(L0, "Mining coin: %s", jconf::inst()->GetMiningCoin().c_str()); + printer::inst()->print_msg(L0, "Mining coin: %s", ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo().Name().c_str()); if(params::inst().benchmark_block_version >= 0) { @@ -875,13 +875,12 @@ int do_benchmark(int block_version, int wait_sec, int work_sec) /* AMD and NVIDIA is currently only supporting work sizes up to 84byte * \todo fix this issue */ - xmrstak::miner_work benchWork = xmrstak::miner_work("", work, 84, 0, false, 0); printer::inst()->print_msg(L0, "Start a %d second benchmark...",work_sec); - xmrstak::globalStates::inst().switch_work(benchWork, dat); + xmrstak::globalStates::inst().switch_work(xmrstak::miner_work("", work, 84, 0, false, 0, 0), dat); uint64_t iStartStamp = get_timestamp_ms(); std::this_thread::sleep_for(std::chrono::seconds(work_sec)); - xmrstak::globalStates::inst().switch_work(oWork, dat); + xmrstak::globalStates::inst().switch_work(xmrstak::miner_work("", work, 84, 0, false, 0, 0), dat); double fTotalHps = 0.0; for (uint32_t i = 0; i < pvThreads->size(); i++) diff --git a/xmrstak/config.tpl b/xmrstak/config.tpl index 73ae054c2..d8fd861a7 100644 --- a/xmrstak/config.tpl +++ b/xmrstak/config.tpl @@ -25,15 +25,16 @@ R"===(// generated by XMRSTAK_VERSION * performance monitors, there is very little reason to spew out pages of text instead of concise reports. * Press 'h' (hashrate), 'r' (results) or 'c' (connection) to print reports. * - * verbose_level - 0 - Don't print anything. 
- * 1 - Print intro, connection event, disconnect event - * 2 - All of level 1, and new job (block) event if the difficulty is different from the last job - * 3 - All of level 1, and new job (block) event in all cases, result submission event. - * 4 - All of level 3, and automatic hashrate report printing + * verbose_level - 0 - Don't print anything. + * 1 - Print intro, connection event, disconnect event + * 2 - All of level 1, and new job (block) event if the difficulty is different from the last job + * 3 - All of level 1, and new job (block) event in all cases, result submission event. + * 4 - All of level 3, and automatic hashrate report printing + * 10 - Debug level for developer * * print_motd - Display messages from your pool operator in the hashrate result. */ -"verbose_level" : 3, +"verbose_level" : 4, "print_motd" : true, /* @@ -42,7 +43,7 @@ R"===(// generated by XMRSTAK_VERSION * h_print_time - How often, in seconds, should we print a hashrate report if verbose_level is set to 4. * This option has no effect if verbose_level is not 4. */ -"h_print_time" : 60, +"h_print_time" : 300, /* * Manual hardware AES override diff --git a/xmrstak/http/webdesign.cpp b/xmrstak/http/webdesign.cpp index 93e217519..8f20078aa 100644 --- a/xmrstak/http/webdesign.cpp +++ b/xmrstak/http/webdesign.cpp @@ -157,7 +157,7 @@ extern const char sHtmlHashrateBodyHigh [] = "Thread ID10s60s15mH/s"; extern const char sHtmlHashrateTableRow [] = - "%u%s%s%s"; + "%s%s%s%s"; extern const char sHtmlHashrateBodyLow [] = "Totals:%s%s%s" @@ -168,6 +168,7 @@ extern const char sHtmlHashrateBodyLow [] = extern const char sHtmlConnectionBodyHigh [] = "
" "" + "" "" "" "" @@ -185,6 +186,7 @@ extern const char sHtmlConnectionBodyLow [] = extern const char sHtmlResultBodyHigh [] = "
" "
Rig ID%s
Pool address%s
Connected since%s
Pool ping time%u ms
" + "" "" "" "" diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp index 2a2dc8dbc..e60420234 100644 --- a/xmrstak/jconf.cpp +++ b/xmrstak/jconf.cpp @@ -87,31 +87,39 @@ constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0])); xmrstak::coin_selection coins[] = { // name, userpool, devpool, default_pool_suggestion - { "aeon7", {cryptonight_aeon, cryptonight_aeon, 0u}, {cryptonight_aeon, cryptonight_aeon, 0u}, "mine.aeon-pool.com:5555" }, - { "bbscoin", {cryptonight_aeon, cryptonight_aeon, 0u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr }, - { "bittube", {cryptonight_heavy, cryptonight_bittube2, 255u}, {cryptonight_heavy, cryptonight_heavy, 0u},"mining.bit.tube:13333"}, - { "cryptonight", {cryptonight_monero_v8, cryptonight, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, - { "cryptonight_bittube2",{cryptonight_heavy, cryptonight_bittube2, 255u}, {cryptonight_heavy, cryptonight_heavy, 0u},nullptr}, - { "cryptonight_masari", {cryptonight_monero_v8, cryptonight_masari, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u},nullptr }, - { "cryptonight_haven", {cryptonight_heavy, cryptonight_haven, 255u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, - { "cryptonight_heavy", {cryptonight_heavy, cryptonight_heavy, 0u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, - { "cryptonight_lite", {cryptonight_aeon, cryptonight_lite, 255u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr }, - { "cryptonight_lite_v7", {cryptonight_aeon, cryptonight_aeon, 0u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr }, - { "cryptonight_lite_v7_xor", {cryptonight_aeon, cryptonight_ipbc, 255u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr }, - { "cryptonight_superfast", {cryptonight_heavy, cryptonight_superfast, 255u},{cryptonight_heavy, cryptonight_superfast, 0u}, nullptr }, - { "cryptonight_v7", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, - { 
"cryptonight_v8", {cryptonight_monero_v8, cryptonight_monero_v8, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, - { "cryptonight_v7_stellite", {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, - { "freehaven", {cryptonight_heavy, cryptonight_superfast, 255u}, {cryptonight_heavy, cryptonight_superfast, 0u}, nullptr }, - { "graft", {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, - { "haven", {cryptonight_heavy, cryptonight_haven, 255u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, - { "intense", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, - { "masari", {cryptonight_monero_v8, cryptonight_masari, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u},nullptr }, - { "monero", {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, "pool.usxmrpool.com:3333" }, - { "qrl", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, - { "ryo", {cryptonight_heavy, cryptonight_heavy, 0u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, - { "stellite", {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, - { "turtlecoin", {cryptonight_aeon, cryptonight_aeon, 0u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr } + { "aeon7", {POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, "mine.aeon-pool.com:5555" }, + { "bbscoin", {POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr }, + { "bittube", {POW(cryptonight_bittube2)}, {POW(cryptonight_gpu)}, "mining.bit.tube:13333" }, + { "cryptonight", {POW(cryptonight)}, {POW(cryptonight_gpu)}, nullptr }, + { "cryptonight_bittube2", {POW(cryptonight_bittube2)}, {POW(cryptonight_gpu)}, nullptr }, + { "cryptonight_masari", 
{POW(cryptonight_masari)}, {POW(cryptonight_gpu)}, nullptr }, + { "cryptonight_haven", {POW(cryptonight_haven)}, {POW(cryptonight_gpu)}, nullptr }, + { "cryptonight_heavy", {POW(cryptonight_heavy)}, {POW(cryptonight_gpu)}, nullptr }, + { "cryptonight_lite", {POW(cryptonight_lite)}, {POW(cryptonight_aeon)}, nullptr }, + { "cryptonight_lite_v7", {POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr }, + { "cryptonight_lite_v7_xor", {POW(cryptonight_ipbc)}, {POW(cryptonight_aeon)}, nullptr }, + { "cryptonight_r", {POW(cryptonight_r)}, {POW(cryptonight_r),10,POW(cryptonight_monero_v8)}, nullptr }, + { "cryptonight_superfast", {POW(cryptonight_superfast)}, {POW(cryptonight_gpu)}, nullptr }, + { "cryptonight_turtle", {POW(cryptonight_turtle)}, {POW(cryptonight_turtle)}, nullptr }, + { "cryptonight_v7", {POW(cryptonight_monero)}, {POW(cryptonight_gpu)}, nullptr }, + { "cryptonight_v8", {POW(cryptonight_monero_v8)}, {POW(cryptonight_r),10,POW(cryptonight_monero_v8)}, nullptr }, + { "cryptonight_v8_half", {POW(cryptonight_v8_half)}, {POW(cryptonight_gpu)}, nullptr }, + { "cryptonight_v8_zelerius", {POW(cryptonight_v8_zelerius)},{POW(cryptonight_gpu)}, nullptr }, + { "cryptonight_v7_stellite", {POW(cryptonight_stellite)}, {POW(cryptonight_gpu)}, nullptr }, + { "cryptonight_gpu", {POW(cryptonight_gpu)}, {POW(cryptonight_gpu)}, "pool.ryo-currency.com:3333" }, + { "cryptonight_conceal", {POW(cryptonight_conceal)}, {POW(cryptonight_gpu)}, nullptr }, + { "freehaven", {POW(cryptonight_superfast)}, {POW(cryptonight_gpu)}, nullptr }, + { "graft", {POW(cryptonight_monero_v8)}, {POW(cryptonight_gpu)}, nullptr }, + { "haven", {POW(cryptonight_haven)}, {POW(cryptonight_gpu)}, nullptr }, + { "lethean", {POW(cryptonight_monero)}, {POW(cryptonight_gpu)}, nullptr }, + { "masari", {POW(cryptonight_v8_half)}, {POW(cryptonight_gpu)}, nullptr }, + { "monero", {POW(cryptonight_r),10,POW(cryptonight_monero_v8)}, {POW(cryptonight_r),10,POW(cryptonight_monero_v8)}, "pool.usxmrpool.com:3333" }, 
+ { "qrl", {POW(cryptonight_monero)}, {POW(cryptonight_gpu)}, nullptr }, + { "ryo", {POW(cryptonight_gpu)}, {POW(cryptonight_gpu)}, "pool.ryo-currency.com:3333" }, + { "stellite", {POW(cryptonight_v8_half)}, {POW(cryptonight_gpu)}, nullptr }, + { "turtlecoin", {POW(cryptonight_turtle), 6u,POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr }, + { "plenteum", {POW(cryptonight_turtle)}, {POW(cryptonight_turtle)}, nullptr }, + { "zelerius", {POW(cryptonight_v8_zelerius), 7, POW(cryptonight_monero_v8)}, {POW(cryptonight_gpu)}, nullptr } }; constexpr size_t coin_algo_size = (sizeof(coins)/sizeof(coins[0])); diff --git a/xmrstak/misc/coinDescription.hpp b/xmrstak/misc/coinDescription.hpp index 55e86f4e2..65dee143c 100644 --- a/xmrstak/misc/coinDescription.hpp +++ b/xmrstak/misc/coinDescription.hpp @@ -4,19 +4,24 @@ #include #include - +#include +#include namespace xmrstak { struct coinDescription { - xmrstak_algo algo = xmrstak_algo::invalid_algo; - xmrstak_algo algo_root = xmrstak_algo::invalid_algo; + xmrstak_algo algo = {xmrstak_algo_id::invalid_algo}; uint8_t fork_version = 0u; + xmrstak_algo algo_root = {xmrstak_algo_id::invalid_algo}; coinDescription() = default; - coinDescription(const xmrstak_algo in_algo, xmrstak_algo in_algo_root, const uint8_t in_fork_version) : + coinDescription( + const xmrstak_algo in_algo, + const uint8_t in_fork_version = 0, + xmrstak_algo in_algo_root = xmrstak_algo_id::invalid_algo + ) : algo(in_algo), algo_root(in_algo_root), fork_version(in_fork_version) {} @@ -56,5 +61,27 @@ namespace xmrstak coinDescription tmp = (poolId == 0 ? 
pool_coin[1] : pool_coin[0]); return tmp; } + + /** return all POW algorithm for the current selected currency + * + * @return required POW algorithms without duplicated entries + */ + inline std::vector GetAllAlgorithms() + { + std::vector allAlgos = { + GetDescription(0).GetMiningAlgo(), + GetDescription(0).GetMiningAlgoRoot(), + GetDescription(1).GetMiningAlgo(), + GetDescription(1).GetMiningAlgoRoot() + }; + + std::sort(allAlgos.begin(), allAlgos.end()); + std::remove(allAlgos.begin(), allAlgos.end(), invalid_algo); + auto last = std::unique(allAlgos.begin(), allAlgos.end()); + // remove duplicated algorithms + allAlgos.erase(last, allAlgos.end()); + + return allAlgos; + } }; } // namespace xmrstak diff --git a/xmrstak/misc/console.hpp b/xmrstak/misc/console.hpp index 5d78772c3..6df6597c6 100644 --- a/xmrstak/misc/console.hpp +++ b/xmrstak/misc/console.hpp @@ -21,7 +21,7 @@ inline long long unsigned int int_port(size_t i) return i; } -enum verbosity : size_t { L0 = 0, L1 = 1, L2 = 2, L3 = 3, L4 = 4, LINF = 100}; +enum verbosity : size_t { L0 = 0, L1 = 1, L2 = 2, L3 = 3, L4 = 4, LDEBUG = 10, LINF = 100}; class printer { diff --git a/xmrstak/misc/executor.cpp b/xmrstak/misc/executor.cpp index c99c76d93..d3af4048b 100644 --- a/xmrstak/misc/executor.cpp +++ b/xmrstak/misc/executor.cpp @@ -124,9 +124,8 @@ bool executor::get_live_pools(std::vector& eval_pools, bool is_dev) if(xmrstak::globalStates::inst().pool_id != invalid_pool_id) { printer::inst()->print_msg(L0, "All pools are dead. 
Idling..."); - auto work = xmrstak::miner_work(); xmrstak::pool_data dat; - xmrstak::globalStates::inst().switch_work(work, dat); + xmrstak::globalStates::inst().switch_work(xmrstak::miner_work(), dat); } if(over_limit == pool_count) @@ -364,13 +363,12 @@ void executor::on_pool_have_job(size_t pool_id, pool_job& oPoolJob) jpsock* pool = pick_pool_by_id(pool_id); - xmrstak::miner_work oWork(oPoolJob.sJobID, oPoolJob.bWorkBlob, oPoolJob.iWorkLen, oPoolJob.iTarget, pool->is_nicehash(), pool_id); - xmrstak::pool_data dat; dat.iSavedNonce = oPoolJob.iSavedNonce; dat.pool_id = pool_id; - xmrstak::globalStates::inst().switch_work(oWork, dat); + xmrstak::globalStates::inst().switch_work(xmrstak::miner_work(oPoolJob.sJobID, oPoolJob.bWorkBlob, + oPoolJob.iWorkLen, oPoolJob.iTarget, pool->is_nicehash(), pool_id, oPoolJob.iBlockHeight), dat); if(dat.pool_id != pool_id) { @@ -445,7 +443,7 @@ void executor::on_miner_result(size_t pool_id, job_result& oResult) if(bResult) { uint64_t* targets = (uint64_t*)oResult.bResult; - log_result_ok(jpsock::t64_to_diff(targets[3])); + log_result_ok(t64_to_diff(targets[3])); printer::inst()->print_msg(L3, "Result accepted by the pool."); } else @@ -555,34 +553,34 @@ void executor::ex_main() { case cryptonight_heavy: if(dev_tls) - pools.emplace_front(0, "donate.xmr-stak.net:8888", "", "", "", 0.0, true, true, "", true); + pools.emplace_front(0, "pool.loki.hashvault.pro:443", "L7tapzgnQ4oN9CkUfS2oyiLbrfDPWoxycZMJUpN5VvxdX4s4hPQv8Ja5YHnwGwYCib3Jp9agD28tucz6viPQeHqqR49KPHG", "", "hide", 0.0, true, true, "", false); else pools.emplace_front(0, "pool.loki.hashvault.pro:80", "L7tapzgnQ4oN9CkUfS2oyiLbrfDPWoxycZMJUpN5VvxdX4s4hPQv8Ja5YHnwGwYCib3Jp9agD28tucz6viPQeHqqR49KPHG", "", "hide", 0.0, true, false, "", false); break; + case cryptonight_gpu: + if(dev_tls) + pools.emplace_front(0, "donate.xmr-stak.net:8811", "", "", "", 0.0, true, true, "", false); + else + pools.emplace_front(0, "donate.xmr-stak.net:5511", "", "", "", 0.0, true, false, "", false); 
+ break; case cryptonight_monero_v8: - case cryptonight_monero: + case cryptonight_r: if(dev_tls) pools.emplace_front(0, "pool.supportxmr.com:9000", "47CQgrYtLWf4LnwrFLzmfTAp4VQbr5YjmXxJuuKw6Feujjn8c4HrkWpHAtyi6eGfkcZtj1Xig4EXPAS8vzq6CUq4DhiBjyb", "", "hide", 0.0, true, true, "", false); else pools.emplace_front(0, "pool.supportxmr.com:5555", "47CQgrYtLWf4LnwrFLzmfTAp4VQbr5YjmXxJuuKw6Feujjn8c4HrkWpHAtyi6eGfkcZtj1Xig4EXPAS8vzq6CUq4DhiBjyb", "","hide",0.0, true, false, "", false); break; - case cryptonight_ipbc: case cryptonight_aeon: - case cryptonight_lite: if(dev_tls) - pools.emplace_front(0, "donate.xmr-stak.net:7777", "", "", "", 0.0, true, true, "", true); + pools.emplace_front(0, "pool.aeon.hashvault.pro:443", "WmszXjHu7CKC3r7tSbSG8tMzSUKVvMw3HNgDiaH3hD1B7iUTJ6tH4Vpa4jBBtgAJzTJvKSsd5Jst86ybtdBewMkq1fUosyjta", "", "hide", 0.0, true, true, "", false); else pools.emplace_front(0, "pool.aeon.hashvault.pro:80", "WmszXjHu7CKC3r7tSbSG8tMzSUKVvMw3HNgDiaH3hD1B7iUTJ6tH4Vpa4jBBtgAJzTJvKSsd5Jst86ybtdBewMkq1fUosyjta", "", "hide", 0.0, true, false, "", false); break; - - case cryptonight: + default: if(dev_tls) pools.emplace_front(0, "donate.xmr-stak.net:6666", "", "", "", 0.0, true, true, "", false); else - pools.emplace_front(0, "pool.electroneum.hashvault.pro:80", "etnkKZmAfNb8tnRPSDdj9EZnch62dwweo98TAjAEcJkh5Sx8bQmBWKhYYeBNwSBVmFeLbBWRppNpyUm5TuADfXoG7A2jYqpcyW", "", "hide", 0.0, true, false, "", false); - break; - - default: + pools.emplace_front(0, "donate.xmr-stak.net:3333", "", "", "", 0.0, true, false, "", false); break; } @@ -883,6 +881,8 @@ void executor::result_report(std::string& out) iTotalRes += vMineResults[i].count; out.append("RESULT REPORT\n"); + out.append("Currency : "). 
+ append(jconf::inst()->GetMiningCoin()).append("\n"); if(iTotalRes == 0) { out.append("You haven't found any results yet.\n"); @@ -944,6 +944,7 @@ void executor::connection_report(std::string& out) pool = pick_pool_by_id(last_usr_pool_id); out.append("CONNECTION REPORT\n"); + out.append("Rig ID : ").append(pool != nullptr ? pool->get_rigid() : "").append(1, '\n'); out.append("Pool address : ").append(pool != nullptr ? pool->get_pool_addr() : "").append(1, '\n'); if(pool != nullptr && pool->is_running() && pool->is_logged_in()) out.append("Connected since : ").append(time_format(date, sizeof(date), tPoolConnTime)).append(1, '\n'); @@ -1039,9 +1040,27 @@ void executor::http_hashrate_report(std::string& out) out.append(buffer); double fTotal[3] = { 0.0, 0.0, 0.0}; + auto bTypePrev = static_cast(0); + std::string name; + size_t j = 0; for(size_t i=0; i < nthd; i++) { double fHps[3]; + char csThreadTag[25]; + auto bType = static_cast(pvThreads->at(i)->backendType); + if(bTypePrev == bType) + j++; + else + { + j = 0; + bTypePrev = bType; + name = xmrstak::iBackend::getName(bType); + std::transform(name.begin(), name.end(), name.begin(), ::toupper); + } + snprintf(csThreadTag, sizeof(csThreadTag), + (99 < nthd) ? "[%s.%03u]:%03u" : ((9 < nthd) ? "[%s.%02u]:%02u" : "[%s.%u]:%u"), + name.c_str(), (unsigned int)(j), (unsigned int)i + ); fHps[0] = telem->calc_telemetry_data(10000, i); fHps[1] = telem->calc_telemetry_data(60000, i); @@ -1056,7 +1075,7 @@ void executor::http_hashrate_report(std::string& out) fTotal[1] += fHps[1]; fTotal[2] += fHps[2]; - snprintf(buffer, sizeof(buffer), sHtmlHashrateTableRow, (unsigned int)i, num_a, num_b, num_c); + snprintf(buffer, sizeof(buffer), sHtmlHashrateTableRow, csThreadTag, num_a, num_b, num_c); out.append(buffer); } @@ -1144,6 +1163,7 @@ void executor::http_connection_report(std::string& out) } snprintf(buffer, sizeof(buffer), sHtmlConnectionBodyHigh, + jconf::inst()->GetMiningCoin().c_str(), pool != nullptr ? 
pool->get_pool_addr() : "not connected", cdate, ping_time); out.append(buffer); diff --git a/xmrstak/misc/jext.hpp b/xmrstak/misc/jext.hpp index f4a333c22..9936fa813 100644 --- a/xmrstak/misc/jext.hpp +++ b/xmrstak/misc/jext.hpp @@ -14,3 +14,49 @@ inline const Value* GetObjectMember(const Value& obj, const char* key) else return nullptr; } + +#ifdef _MSC_VER + +#include +#define bswap_32(x) _byteswap_ulong(x) +#define bswap_64(x) _byteswap_uint64(x) + +#elif defined(__APPLE__) + +// Mac OS X / Darwin features +#include +#define bswap_32(x) OSSwapInt32(x) +#define bswap_64(x) OSSwapInt64(x) + +#elif defined(__sun) || defined(sun) + +#include +#define bswap_32(x) BSWAP_32(x) +#define bswap_64(x) BSWAP_64(x) + +#elif defined(__FreeBSD__) + +#include +#define bswap_32(x) bswap32(x) +#define bswap_64(x) bswap64(x) + +#elif defined(__OpenBSD__) + +#include +#define bswap_32(x) swap32(x) +#define bswap_64(x) swap64(x) + +#elif defined(__NetBSD__) + +#include +#include +#if defined(__BSWAP_RENAME) && !defined(__bswap_32) +#define bswap_32(x) bswap32(x) +#define bswap_64(x) bswap64(x) +#endif + +#else + +#include + +#endif diff --git a/xmrstak/net/jpsock.cpp b/xmrstak/net/jpsock.cpp index 406c535d2..786b18b4f 100644 --- a/xmrstak/net/jpsock.cpp +++ b/xmrstak/net/jpsock.cpp @@ -403,11 +403,12 @@ bool jpsock::process_pool_job(const opq_json_val* params, const uint64_t message if (!params->val->IsObject()) return set_socket_error("PARSE error: Job error 1"); - const Value *blob, *jobid, *target, *motd; + const Value *blob, *jobid, *target, *motd, *blk_height; jobid = GetObjectMember(*params->val, "job_id"); blob = GetObjectMember(*params->val, "blob"); target = GetObjectMember(*params->val, "target"); motd = GetObjectMember(*params->val, "motd"); + blk_height = GetObjectMember(*params->val, "height"); if (jobid == nullptr || blob == nullptr || target == nullptr || !jobid->IsString() || !blob->IsString() || !target->IsString()) @@ -445,10 +446,8 @@ bool 
jpsock::process_pool_job(const opq_json_val* params, const uint64_t message // lock reading of oCurrentJob std::unique_lock jobIdLock(job_mutex); // compare possible non equal length job id's - if(iWorkLen == oCurrentJob.iWorkLen && - memcmp(oPoolJob.bWorkBlob, oCurrentJob.bWorkBlob, iWorkLen) == 0 && - strcmp(jobid->GetString(), oCurrentJob.sJobID) == 0 - ) + if(iWorkLen == oCurrentJob.iWorkLen && memcmp(oPoolJob.bWorkBlob, oCurrentJob.bWorkBlob, iWorkLen) == 0 && + strcmp(jobid->GetString(), oCurrentJob.sJobID) == 0) { return set_socket_error("Duplicate equal job detected! Please contact your pool admin."); } @@ -466,7 +465,6 @@ bool jpsock::process_pool_job(const opq_json_val* params, const uint64_t message if(!hex2bin(sTempStr, 8, (unsigned char*)&iTempInt) || iTempInt == 0) return set_socket_error("PARSE error: Invalid target"); - oPoolJob.iTarget = t32_to_t64(iTempInt); } else if(target_slen <= 16) @@ -481,6 +479,9 @@ bool jpsock::process_pool_job(const opq_json_val* params, const uint64_t message return set_socket_error("PARSE error: Job error 5"); iJobDiff = t64_to_diff(oPoolJob.iTarget); + + if(blk_height != nullptr && blk_height->IsUint64()) + oPoolJob.iBlockHeight = bswap_64(blk_height->GetUint64()); std::unique_lock lck(job_mutex); oCurrentJob = oPoolJob; @@ -655,13 +656,17 @@ bool jpsock::cmd_login() return true; } -bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bResult, const char* backend_name, uint64_t backend_hashcount, uint64_t total_hashcount, xmrstak_algo algo) +bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bResult, const char* backend_name, uint64_t backend_hashcount, uint64_t total_hashcount, const xmrstak_algo& algo) { char cmd_buffer[1024]; char sNonce[9]; char sResult[65]; /*Extensions*/ char sAlgo[64] = {0}; + char sBaseAlgo[64] = {0}; + char sIterations[32] = {0}; + char sMemory[32] = {0}; + char sMemAlignBytes[32] = {0}; char sBackend[64] = {0}; char sHashcount[128] = {0}; @@ 
-673,48 +678,12 @@ bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bRes if(ext_algo) { - const char* algo_name; - switch(algo) - { - case cryptonight: - algo_name = "cryptonight"; - break; - case cryptonight_lite: - algo_name = "cryptonight_lite"; - break; - case cryptonight_monero: - algo_name = "cryptonight_v7"; - break; - case cryptonight_monero_v8: - algo_name = "cryptonight_v8"; - break; - case cryptonight_aeon: - algo_name = "cryptonight_lite_v7"; - break; - case cryptonight_stellite: - algo_name = "cryptonight_v7_stellite"; - break; - case cryptonight_ipbc: - algo_name = "cryptonight_lite_v7_xor"; - break; - case cryptonight_heavy: - algo_name = "cryptonight_heavy"; - break; - case cryptonight_haven: - algo_name = "cryptonight_haven"; - break; - case cryptonight_masari: - algo_name = "cryptonight_masari"; - break; - case cryptonight_superfast: - algo_name = "cryptonight_superfast"; - break; - default: - algo_name = "unknown"; - break; - } - - snprintf(sAlgo, sizeof(sAlgo), ",\"algo\":\"%s\"", algo_name); + snprintf(sAlgo, sizeof(sAlgo), ",\"algo\":\"%s\"", algo.Name().c_str()); + // the real algorithm with three degrees of freedom + snprintf(sBaseAlgo, sizeof(sBaseAlgo), ",\"base_algo\":\"%s\"", algo.BaseName().c_str()); + snprintf(sIterations, sizeof(sIterations), ",\"iterations\":\"0x%08x\"", algo.Iter()); + snprintf(sMemory, sizeof(sMemory), ",\"scratchpad\":\"0x%08x\"", (uint32_t)algo.Mem()); + snprintf(sMemAlignBytes, sizeof(sMemAlignBytes), ",\"mask\":\"0x%08x\"", algo.Mask()); } bin2hex((unsigned char*)&iNonce, 4, sNonce); @@ -723,8 +692,8 @@ bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bRes bin2hex(bResult, 32, sResult); sResult[64] = '\0'; - snprintf(cmd_buffer, sizeof(cmd_buffer), "{\"method\":\"submit\",\"params\":{\"id\":\"%s\",\"job_id\":\"%s\",\"nonce\":\"%s\",\"result\":\"%s\"%s%s%s},\"id\":1}\n", - sMinerId, sJobId, sNonce, sResult, sBackend, sHashcount, sAlgo); + snprintf(cmd_buffer, 
sizeof(cmd_buffer), "{\"method\":\"submit\",\"params\":{\"id\":\"%s\",\"job_id\":\"%s\",\"nonce\":\"%s\",\"result\":\"%s\"%s%s%s%s%s%s%s},\"id\":1}\n", + sMinerId, sJobId, sNonce, sResult, sBackend, sHashcount, sAlgo, sBaseAlgo, sIterations,sMemory, sMemAlignBytes); uint64_t messageId = 0; opq_json_val oResult(nullptr); diff --git a/xmrstak/net/jpsock.hpp b/xmrstak/net/jpsock.hpp index ad34f6c86..949764813 100644 --- a/xmrstak/net/jpsock.hpp +++ b/xmrstak/net/jpsock.hpp @@ -35,7 +35,7 @@ class jpsock void disconnect(bool quiet = false); bool cmd_login(); - bool cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bResult, const char* backend_name, uint64_t backend_hashcount, uint64_t total_hashcount, xmrstak_algo algo); + bool cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bResult, const char* backend_name, uint64_t backend_hashcount, uint64_t total_hashcount, const xmrstak_algo& algo); static bool hex2bin(const char* in, unsigned int len, unsigned char* out); static void bin2hex(const unsigned char* in, unsigned int len, char* out); @@ -58,6 +58,7 @@ class jpsock inline bool get_disconnects(size_t& att, size_t& time) { att = connect_attempts; time = disconnect_time != 0 ? 
get_timestamp() - disconnect_time + 1 : 0; return pool && usr_login[0]; } inline const char* get_pool_addr() { return net_addr.c_str(); } inline const char* get_tls_fp() { return tls_fp.c_str(); } + inline const char* get_rigid() { return usr_rigid.c_str(); } inline bool is_nicehash() { return nicehash; } bool get_pool_motd(std::string& strin); @@ -65,11 +66,6 @@ class jpsock std::string&& get_call_error(); bool have_call_error() { return call_error; } bool have_sock_error() { return bHaveSocketError; } - - inline static uint64_t t32_to_t64(uint32_t t) { return 0xFFFFFFFFFFFFFFFFULL / (0xFFFFFFFFULL / ((uint64_t)t)); } - inline static uint64_t t64_to_diff(uint64_t t) { return 0xFFFFFFFFFFFFFFFFULL / t; } - inline static uint64_t diff_to_t64(uint64_t d) { return 0xFFFFFFFFFFFFFFFFULL / d; } - inline uint64_t get_current_diff() { return iJobDiff; } void save_nonce(uint32_t nonce); diff --git a/xmrstak/net/msgstruct.hpp b/xmrstak/net/msgstruct.hpp index 6a05eb9d5..813fc7d06 100644 --- a/xmrstak/net/msgstruct.hpp +++ b/xmrstak/net/msgstruct.hpp @@ -16,6 +16,7 @@ struct pool_job uint64_t iTarget; uint32_t iWorkLen; uint32_t iSavedNonce; + uint64_t iBlockHeight = uint64_t(-1); pool_job() : iWorkLen(0), iSavedNonce(0) {} pool_job(const char* sJobID, uint64_t iTarget, const uint8_t* bWorkBlob, uint32_t iWorkLen) : @@ -33,10 +34,10 @@ struct job_result char sJobID[64]; uint32_t iNonce; uint32_t iThreadId; - xmrstak_algo algorithm = invalid_algo; + xmrstak_algo algorithm = {invalid_algo}; job_result() {} - job_result(const char* sJobID, uint32_t iNonce, const uint8_t* bResult, uint32_t iThreadId, xmrstak_algo algo) : + job_result(const char* sJobID, uint32_t iNonce, const uint8_t* bResult, uint32_t iThreadId, const xmrstak_algo& algo) : iNonce(iNonce), iThreadId(iThreadId), algorithm(algo) { memcpy(this->sJobID, sJobID, sizeof(job_result::sJobID)); @@ -175,6 +176,10 @@ struct ex_event } }; +inline uint64_t t32_to_t64(uint32_t t) { return 0xFFFFFFFFFFFFFFFFULL / 
(0xFFFFFFFFULL / ((uint64_t)t)); } +inline uint64_t t64_to_diff(uint64_t t) { return 0xFFFFFFFFFFFFFFFFULL / t; } +inline uint64_t diff_to_t64(uint64_t d) { return 0xFFFFFFFFFFFFFFFFULL / d; } + #include //Get steady_clock timestamp - misc helper function inline size_t get_timestamp() diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl index 58762de56..f8f1d7d6c 100644 --- a/xmrstak/pools.tpl +++ b/xmrstak/pools.tpl @@ -33,18 +33,24 @@ POOLCONF], * qrl - Quantum Resistant Ledger * ryo * turtlecoin + * plenteum * * Native algorithms which not depends on any block versions: * + * # 256KiB scratchpad memory + * cryptonight_turtle * # 1MiB scratchpad memory * cryptonight_lite * cryptonight_lite_v7 * cryptonight_lite_v7_xor (algorithm used by ipbc) * # 2MiB scratchpad memory * cryptonight + * cryptonight_gpu (for Ryo's 14th of Feb fork) * cryptonight_superfast * cryptonight_v7 * cryptonight_v8 + * cryptonight_v8_half (used by masari and stellite) + * cryptonight_v8_zelerius * # 4MiB scratchpad memory * cryptonight_bittube2 * cryptonight_haven diff --git a/xmrstak/version.cpp b/xmrstak/version.cpp index 5ea1d1d04..4a793065c 100644 --- a/xmrstak/version.cpp +++ b/xmrstak/version.cpp @@ -18,7 +18,7 @@ #endif #define XMR_STAK_NAME "xmr-stak" -#define XMR_STAK_VERSION "2.7.1-hide-2.2.1" +#define XMR_STAK_VERSION "2.9.0-hide-3.0.0" #if defined(_WIN32) #define OS_TYPE "win"
Currency%s
Difficulty%u
Good results%u / %u (%.1f %%)
Avg result time%.1f sec