From 4b09340e5f132aac9317f29fdad2a9a968f70dfa Mon Sep 17 00:00:00 2001 From: rapid821 Date: Wed, 15 May 2019 20:05:38 +0800 Subject: [PATCH] Upgrade to 2.10.4-hide-3.1.2 --- .clang-format | 14 + .github/ISSUE_TEMPLATE.md | 30 - .github/ISSUE_TEMPLATE/compile_bug_report.md | 35 + .../ISSUE_TEMPLATE/execution_bug_report.md | 7 + .github/ISSUE_TEMPLATE/feature_request.md | 7 + .github/ISSUE_TEMPLATE/tuning_help.md | 7 + CMakeLists.txt | 10 +- xmrstak/backend/amd/OclCryptonightR_gen.cpp | 369 +-- xmrstak/backend/amd/OclCryptonightR_gen.hpp | 4 +- xmrstak/backend/amd/amd_gpu/gpu.cpp | 445 ++-- xmrstak/backend/amd/amd_gpu/gpu.hpp | 294 +-- .../backend/amd/amd_gpu/opencl/cryptonight.cl | 12 +- .../amd/amd_gpu/opencl/cryptonight_gpu.cl | 4 +- .../{cryptonight_r.cl => cryptonight_r.rtcl} | 33 +- .../amd/amd_gpu/opencl/cryptonight_r_def.rtcl | 33 + .../backend/amd/amd_gpu/opencl/groestl256.cl | 3 +- xmrstak/backend/amd/autoAdjust.hpp | 65 +- xmrstak/backend/amd/jconf.cpp | 55 +- xmrstak/backend/amd/jconf.hpp | 13 +- xmrstak/backend/amd/minethd.cpp | 62 +- xmrstak/backend/amd/minethd.hpp | 11 +- xmrstak/backend/backendConnector.cpp | 19 +- xmrstak/backend/backendConnector.hpp | 15 +- xmrstak/backend/cpu/autoAdjust.hpp | 33 +- xmrstak/backend/cpu/autoAdjustHwloc.hpp | 38 +- xmrstak/backend/cpu/cpuType.cpp | 95 +- xmrstak/backend/cpu/cpuType.hpp | 28 +- .../backend/cpu/crypto/CryptonightR_gen.cpp | 140 +- xmrstak/backend/cpu/crypto/c_blake256.c | 245 +- xmrstak/backend/cpu/crypto/c_blake256.h | 48 +- xmrstak/backend/cpu/crypto/c_groestl.c | 570 ++-- xmrstak/backend/cpu/crypto/c_groestl.h | 33 +- xmrstak/backend/cpu/crypto/c_jh.c | 635 +++-- xmrstak/backend/cpu/crypto/c_jh.h | 2 +- xmrstak/backend/cpu/crypto/c_keccak.c | 98 +- xmrstak/backend/cpu/crypto/c_keccak.h | 4 +- xmrstak/backend/cpu/crypto/c_skein.c | 2326 +++++++++-------- xmrstak/backend/cpu/crypto/c_skein.h | 23 +- xmrstak/backend/cpu/crypto/cn_gpu.hpp | 2 +- xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp | 18 +- xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp | 20 +- xmrstak/backend/cpu/crypto/cryptonight.h | 12 +- .../backend/cpu/crypto/cryptonight_aesni.h | 760 +++--- .../backend/cpu/crypto/cryptonight_common.cpp | 61 +- xmrstak/backend/cpu/crypto/groestl_tables.h | 34 +- xmrstak/backend/cpu/crypto/hash.h | 7 +- xmrstak/backend/cpu/crypto/int-util.h | 105 +- xmrstak/backend/cpu/crypto/skein_port.h | 175 +- xmrstak/backend/cpu/crypto/soft_aes.hpp | 100 +- .../backend/cpu/crypto/variant4_random_math.h | 197 +- xmrstak/backend/cpu/hwlocMemory.cpp | 20 +- xmrstak/backend/cpu/hwlocMemory.hpp | 2 +- xmrstak/backend/cpu/jconf.cpp | 34 +- xmrstak/backend/cpu/jconf.hpp | 12 +- xmrstak/backend/cpu/minethd.cpp | 212 +- xmrstak/backend/cpu/minethd.hpp | 23 +- xmrstak/backend/cryptonight.hpp | 131 +- xmrstak/backend/globalStates.cpp | 5 +- xmrstak/backend/globalStates.hpp | 13 +- xmrstak/backend/iBackend.hpp | 81 +- xmrstak/backend/miner_work.hpp | 163 +- .../backend/nvidia/CudaCryptonightR_gen.cpp | 491 ++-- .../backend/nvidia/CudaCryptonightR_gen.hpp | 9 +- xmrstak/backend/nvidia/autoAdjust.hpp | 38 +- xmrstak/backend/nvidia/jconf.cpp | 41 +- xmrstak/backend/nvidia/jconf.hpp | 15 +- xmrstak/backend/nvidia/minethd.cpp | 67 +- xmrstak/backend/nvidia/minethd.hpp | 12 +- .../backend/nvidia/nvcc_code/cryptonight.hpp | 46 +- xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp | 544 ++-- .../backend/nvidia/nvcc_code/cuda_blake.hpp | 176 +- xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 727 +++--- .../nvidia/nvcc_code/cuda_cryptonight_gpu.hpp | 158 +- .../nvidia/nvcc_code/cuda_cryptonight_r.curt | 8 +- .../backend/nvidia/nvcc_code/cuda_device.hpp | 49 +- .../backend/nvidia/nvcc_code/cuda_extra.cu | 318 ++- .../backend/nvidia/nvcc_code/cuda_extra.hpp | 140 +- .../nvidia/nvcc_code/cuda_fast_div_heavy.hpp | 1 - .../nvcc_code/cuda_fast_int_math_v2.hpp | 20 +- .../backend/nvidia/nvcc_code/cuda_groestl.hpp | 325 ++- xmrstak/backend/nvidia/nvcc_code/cuda_jh.hpp | 325 +-- .../backend/nvidia/nvcc_code/cuda_keccak.hpp | 143 +- .../backend/nvidia/nvcc_code/cuda_skein.hpp | 369 +-- xmrstak/backend/plugin.hpp | 42 +- xmrstak/backend/pool_data.hpp | 6 +- xmrstak/cli/cli-miner.cpp | 351 +-- xmrstak/http/httpd.cpp | 33 +- xmrstak/http/httpd.hpp | 25 +- xmrstak/http/webdesign.cpp | 207 +- xmrstak/jconf.cpp | 183 +- xmrstak/jconf.hpp | 14 +- xmrstak/misc/coinDescription.hpp | 118 +- xmrstak/misc/configEditor.hpp | 13 +- xmrstak/misc/console.cpp | 34 +- xmrstak/misc/console.hpp | 28 +- xmrstak/misc/executor.cpp | 173 +- xmrstak/misc/executor.hpp | 32 +- xmrstak/misc/home_dir.hpp | 43 + xmrstak/misc/jext.hpp | 4 +- xmrstak/misc/telemetry.cpp | 17 +- xmrstak/misc/telemetry.hpp | 4 +- xmrstak/misc/thdq.hpp | 32 +- xmrstak/misc/uac.cpp | 28 +- xmrstak/misc/utility.cpp | 26 +- xmrstak/misc/utility.hpp | 4 +- xmrstak/net/jpsock.cpp | 101 +- xmrstak/net/jpsock.hpp | 19 +- xmrstak/net/msgstruct.hpp | 112 +- xmrstak/net/socket.cpp | 51 +- xmrstak/net/socket.hpp | 18 +- xmrstak/net/socks.hpp | 19 +- xmrstak/params.hpp | 7 +- xmrstak/version.cpp | 10 +- xmrstak/version.hpp | 4 +- 114 files changed, 7259 insertions(+), 6573 deletions(-) create mode 100644 .clang-format delete mode 100644 .github/ISSUE_TEMPLATE.md create mode 100644 .github/ISSUE_TEMPLATE/compile_bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/execution_bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md create mode 100644 .github/ISSUE_TEMPLATE/tuning_help.md rename xmrstak/backend/amd/amd_gpu/opencl/{cryptonight_r.cl => cryptonight_r.rtcl} (88%) create mode 100644 xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r_def.rtcl create mode 100644 xmrstak/misc/home_dir.hpp diff --git a/.clang-format b/.clang-format new file mode 100644 index 000000000..25ba43d61 --- /dev/null +++ b/.clang-format @@ -0,0 +1,14 @@ +IndentWidth: 4 +TabWidth: 4 +ColumnLimit: 0 +BreakBeforeBraces: Allman +AllowShortIfStatementsOnASingleLine: false +IndentCaseLabels: false +SpaceBeforeParens: Never +UseTab: Always +AlignAfterOpenBracket: DontAlign +PointerBindsToType: true +BreakConstructorInitializers: AfterColon +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md deleted file mode 100644 index 8451f3289..000000000 --- a/.github/ISSUE_TEMPLATE.md +++ /dev/null @@ -1,30 +0,0 @@ -Please provide as much as possible information to reproduce the issue. - -# Basic information - - Type of the CPU. - - Type of the GPU (if you try to miner with the GPU). - -# Compile issues - - Which OS do you use? - ``` - add **all** commands you used and the **full** compile output here - ``` - ``` - run `cmake -LA .` in the build folder and add the output here - ``` - -# Issue with the execution - - Do you compiled the miner by our own? - ``` - run `./xmr-stak --version-long` and add the output here - ``` - -# AMD OpenCl issue - - ``` - run `clinfo` and add the output here - ``` - -# Stability issue - - Is the CPU or GPU overclocked? - - Is the Main memory of the CPU or GPU undervolted? diff --git a/.github/ISSUE_TEMPLATE/compile_bug_report.md b/.github/ISSUE_TEMPLATE/compile_bug_report.md new file mode 100644 index 000000000..899ad941f --- /dev/null +++ b/.github/ISSUE_TEMPLATE/compile_bug_report.md @@ -0,0 +1,35 @@ +--- +name: Compile bug report +about: You have an issue to compile xmr-stak. + +--- + +`...` are the placeholder for your answers. Please answer each question! + + +**Describe the bug** +A clear and concise description of what the bug is. + +**Which operating system do you use? ** + +``` +... +``` + +**To Reproduce** +``` +# Please post all commands and the output. +... +``` + +**Additional information.** + +``` +# run `cmake -LA .` in the build folder and add the output here +... +``` + +**Feel free to add more information.** +``` +... +``` diff --git a/.github/ISSUE_TEMPLATE/execution_bug_report.md b/.github/ISSUE_TEMPLATE/execution_bug_report.md new file mode 100644 index 000000000..44ac89bf1 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/execution_bug_report.md @@ -0,0 +1,7 @@ +--- +name: Execution bug report +about: You have an issue to execute xmr-stak. + +--- + +**Most execution issues are caused by driver problems. Please use the [xmr-stak sub-reddit](https://www.reddit.com/r/XmrStak/) to ask for help instead of opening an issue here.** diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 000000000..90f5e4f3d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,7 @@ +--- +name: Feature request +about: Suggest an idea for xmr-stak. + +--- + +**Please explain the feature as good as possible.** diff --git a/.github/ISSUE_TEMPLATE/tuning_help.md b/.github/ISSUE_TEMPLATE/tuning_help.md new file mode 100644 index 000000000..40dedef05 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/tuning_help.md @@ -0,0 +1,7 @@ +--- +name: Need help for optimization. +about: You need help to optimize your setup. + +--- + +**Please use the [xmr-stak sub-reddit](https://www.reddit.com/r/XmrStak/) to discuss optimizations.** diff --git a/CMakeLists.txt b/CMakeLists.txt index 004b5555c..41e993eee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -84,10 +84,11 @@ if(CUDA_ENABLE) /usr /usr/local/cuda PATH_SUFFIXES - lib64 + lib64 lib/x64 lib/Win32 - lib64/stubs) + lib64/stubs + lib) #nvrtc find_library(CUDA_NVRTC_LIB @@ -104,7 +105,8 @@ if(CUDA_ENABLE) PATH_SUFFIXES lib64 lib/x64 - lib/Win32) + lib/Win32 + lib) list(APPEND BACKEND_TYPES "nvidia") option(XMR-STAK_LARGEGRID "Support large CUDA block count > 128" ON) @@ -322,7 +324,7 @@ endif() ################################################################################ if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") - set_source_files_properties(xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp PROPERTIES COMPILE_FLAGS "-mavx2") + set_source_files_properties(xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp PROPERTIES COMPILE_FLAGS "-mavx2") endif() ################################################################################ diff --git a/xmrstak/backend/amd/OclCryptonightR_gen.cpp b/xmrstak/backend/amd/OclCryptonightR_gen.cpp index 7358e9857..2a60c46d9 100644 --- a/xmrstak/backend/amd/OclCryptonightR_gen.cpp +++ b/xmrstak/backend/amd/OclCryptonightR_gen.cpp @@ -1,19 +1,18 @@ -#include -#include -#include #include +#include +#include +#include #include - #include "xmrstak/backend/amd/OclCryptonightR_gen.hpp" #include "xmrstak/backend/cpu/crypto/variant4_random_math.h" -#include "xmrstak/misc/console.hpp" #include "xmrstak/cpputil/read_write_lock.h" +#include "xmrstak/misc/console.hpp" #include -#include #include - +#include +#include namespace xmrstak { @@ -22,16 +21,16 @@ namespace amd static std::string get_code(const V4_Instruction* code, int code_size) { - std::stringstream s; + std::stringstream s; - for (int i = 0; i < code_size; ++i) + for(int i = 0; i < code_size; ++i) { const V4_Instruction inst = code[i]; const uint32_t a = inst.dst_index; const uint32_t b = inst.src_index; - switch (inst.opcode) + switch(inst.opcode) { case MUL: s << 'r' << a << "*=r" << b << ';'; @@ -58,37 +57,39 @@ static std::string get_code(const V4_Instruction* code, int code_size) s << '\n'; } - return s.str(); + return s.str(); } struct CacheEntry { - CacheEntry(xmrstak_algo algo, uint64_t height, size_t deviceIdx, cl_program program) : - algo(algo), - height(height), - deviceIdx(deviceIdx), - program(program) - {} - - xmrstak_algo algo; - uint64_t height; - size_t deviceIdx; - cl_program program; + CacheEntry(xmrstak_algo algo, uint64_t height_offset, size_t deviceIdx, cl_program program) : + algo(algo), + height_offset(height_offset), + deviceIdx(deviceIdx), + program(program) + { + } + + xmrstak_algo algo; + uint64_t height_offset; + size_t deviceIdx; + cl_program program; }; struct BackgroundTaskBase { - virtual ~BackgroundTaskBase() {} - virtual void exec() = 0; + virtual ~BackgroundTaskBase() {} + virtual void exec() = 0; }; -template +template struct BackgroundTask : public BackgroundTaskBase { - BackgroundTask(T&& func) : m_func(std::move(func)) {} - void exec() override { m_func(); } + BackgroundTask(T&& func) : + m_func(std::move(func)) {} + void exec() override { m_func(); } - T m_func; + T m_func; }; static ::cpputil::RWLock CryptonightR_cache_mutex; @@ -99,99 +100,113 @@ static std::mutex background_tasks_mutex; static std::vector background_tasks; static std::thread* background_thread = nullptr; +static cl_program search_program( + const GpuContext* ctx, + xmrstak_algo algo, + uint64_t height_offset, + bool lock_cache = true) +{ + if(lock_cache) + CryptonightR_cache_mutex.ReadLock(); + + // Check if the cache has this program + for(const CacheEntry& entry : CryptonightR_cache) + { + if((entry.algo == algo) && (entry.height_offset == height_offset) && (entry.deviceIdx == ctx->deviceIdx)) + { + printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height_offset %llu found in cache", height_offset); + auto result = entry.program; + if(lock_cache) + CryptonightR_cache_mutex.UnLock(); + return result; + } + } + if(lock_cache) + CryptonightR_cache_mutex.UnLock(); + + return nullptr; +} + static void background_thread_proc() { - std::vector tasks; - for (;;) { - tasks.clear(); - { - std::lock_guard g(background_tasks_mutex); - background_tasks.swap(tasks); - } - - for (BackgroundTaskBase* task : tasks) { - task->exec(); - delete task; - } + std::vector tasks; + for(;;) + { + tasks.clear(); + { + std::lock_guard g(background_tasks_mutex); + background_tasks.swap(tasks); + } + + for(BackgroundTaskBase* task : tasks) + { + task->exec(); + delete task; + } std::this_thread::sleep_for(std::chrono::milliseconds(500)); - } + } } -template +template static void background_exec(T&& func) { - BackgroundTaskBase* task = new BackgroundTask(std::move(func)); + BackgroundTaskBase* task = new BackgroundTask(std::move(func)); - std::lock_guard g(background_tasks_mutex); - background_tasks.push_back(task); - if (!background_thread) { - background_thread = new std::thread(background_thread_proc); - } + std::lock_guard g(background_tasks_mutex); + background_tasks.push_back(task); + if(!background_thread) + { + background_thread = new std::thread(background_thread_proc); + } } static cl_program CryptonightR_build_program( - const GpuContext* ctx, - xmrstak_algo algo, - uint64_t height, - uint32_t precompile_count, - cl_kernel old_kernel, - std::string source_code, - std::string options) + const GpuContext* ctx, + xmrstak_algo algo, + uint64_t height_offset, + uint64_t height_chunk_size, + uint32_t precompile_count, + std::string source_code, + std::string options) { - if(old_kernel) - clReleaseKernel(old_kernel); - - - std::vector old_programs; - old_programs.reserve(32); - { + std::vector old_programs; + old_programs.reserve(32); + { CryptonightR_cache_mutex.WriteLock(); - // Remove old programs from cache - for(size_t i = 0; i < CryptonightR_cache.size();) - { - const CacheEntry& entry = CryptonightR_cache[i]; - if ((entry.algo == algo) && (entry.height + 2 + precompile_count < height)) - { - printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu released (old program)", entry.height); - old_programs.push_back(entry.program); - CryptonightR_cache[i] = std::move(CryptonightR_cache.back()); - CryptonightR_cache.pop_back(); - } - else - { - ++i; - } - } + // Remove old programs from cache + for(size_t i = 0; i < CryptonightR_cache.size();) + { + const CacheEntry& entry = CryptonightR_cache[i]; + if((entry.algo == algo) && (entry.height_offset + (2 + precompile_count) * height_chunk_size < height_offset)) + { + printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height_offset %llu released (old program)", entry.height_offset); + old_programs.push_back(entry.program); + CryptonightR_cache[i] = std::move(CryptonightR_cache.back()); + CryptonightR_cache.pop_back(); + } + else + { + ++i; + } + } CryptonightR_cache_mutex.UnLock(); - } - - for(cl_program p : old_programs) { - clReleaseProgram(p); - } + } - std::lock_guard g1(CryptonightR_build_mutex); + for(cl_program p : old_programs) + { + clReleaseProgram(p); + } - cl_program program = nullptr; - { - CryptonightR_cache_mutex.ReadLock(); + std::lock_guard g1(CryptonightR_build_mutex); - // Check if the cache already has this program (some other thread might have added it first) - for (const CacheEntry& entry : CryptonightR_cache) - { - if ((entry.algo == algo) && (entry.height == height) && (entry.deviceIdx == ctx->deviceIdx)) - { - program = entry.program; - break; - } - } - CryptonightR_cache_mutex.UnLock(); - } + cl_program program = search_program(ctx, algo, height_offset); - if (program) { - return program; - } + if(program) + { + return program; + } cl_int ret; const char* source = source_code.c_str(); @@ -199,7 +214,7 @@ static cl_program CryptonightR_build_program( program = clCreateProgramWithSource(ctx->opencl_ctx, 1, (const char**)&source, NULL, &ret); if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L0,"Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret)); + printer::inst()->print_msg(L0, "Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret)); return program; } @@ -207,11 +222,11 @@ static cl_program CryptonightR_build_program( if(ret != CL_SUCCESS) { size_t len; - printer::inst()->print_msg(L0,"Error %s when calling clBuildProgram.", err_to_str(ret)); + printer::inst()->print_msg(L0, "Error %s when calling clBuildProgram.", err_to_str(ret)); if((ret = clGetProgramBuildInfo(program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS) { - printer::inst()->print_msg(L0,"Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret)); + printer::inst()->print_msg(L0, "Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret)); return program; } @@ -221,12 +236,12 @@ static cl_program CryptonightR_build_program( if((ret = clGetProgramBuildInfo(program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS) { free(BuildLog); - printer::inst()->print_msg(L0,"Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret)); + printer::inst()->print_msg(L0, "Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret)); return program; } printer::inst()->print_str("Build log:\n"); - std::cerr<DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L0,"Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret)); + printer::inst()->print_msg(L0, "Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret)); return program; } std::this_thread::sleep_for(std::chrono::milliseconds(1000)); - } - while(status == CL_BUILD_IN_PROGRESS); + } while(status == CL_BUILD_IN_PROGRESS); + CryptonightR_cache_mutex.WriteLock(); + auto cached_program = search_program(ctx, algo, height_offset, false); - printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu compiled", height); + if(cached_program) + { + printer::inst()->print_msg(LDEBUG, "CryptonightR: release already existing program %llu", height_offset); + clReleaseProgram(program); + program = cached_program; + } + else + { + CryptonightR_cache.emplace_back(algo, height_offset, ctx->deviceIdx, program); + printer::inst()->print_msg(LDEBUG, "CryptonightR: cache compiled program for height_offset %llu", height_offset); + } - CryptonightR_cache_mutex.WriteLock(); - CryptonightR_cache.emplace_back(algo, height, ctx->deviceIdx, program); CryptonightR_cache_mutex.UnLock(); - return program; + return program; } -cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t height, uint32_t precompile_count, bool background, cl_kernel old_kernel) +cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t height_offset, uint64_t height_chunk_size, uint32_t precompile_count, bool background) { - printer::inst()->print_msg(LDEBUG, "CryptonightR: start %llu released",height); - - if (background) { - background_exec([=](){ CryptonightR_get_program(ctx, algo, height, precompile_count, false, old_kernel); }); - return nullptr; - } - - const char* source_code_template = - #include "amd_gpu/opencl/wolf-aes.cl" - #include "amd_gpu/opencl/cryptonight_r.cl" - ; - const char include_name[] = "XMRSTAK_INCLUDE_RANDOM_MATH"; - const char* offset = strstr(source_code_template, include_name); - if (!offset) - { - printer::inst()->print_msg(LDEBUG, "CryptonightR_get_program: XMRSTAK_INCLUDE_RANDOM_MATH not found in cryptonight_r.cl", algo); - return nullptr; - } - - V4_Instruction code[256]; - int code_size; - switch (algo.Id()) - { - case cryptonight_r_wow: - code_size = v4_random_math_init(code, height); - break; - case cryptonight_r: - code_size = v4_random_math_init(code, height); - break; - default: - printer::inst()->print_msg(L0, "CryptonightR_get_program: invalid algo %d", algo); - return nullptr; - } - - std::string source_code(source_code_template, offset); - source_code.append(get_code(code, code_size)); - source_code.append(offset + sizeof(include_name) - 1); + if(background) + { + background_exec([=]() { CryptonightR_get_program(ctx, algo, height_offset, height_chunk_size, precompile_count, false); }); + return nullptr; + } + + auto program = search_program(ctx, algo, height_offset); + + if(program != nullptr) + return program; + + printer::inst()->print_msg(LDEBUG, "CryptonightR: create code for block %llu to %llu", height_offset, height_offset + height_chunk_size); + + const char* source_code_definitions = +#include "amd_gpu/opencl/cryptonight_r_def.rtcl" +#include "amd_gpu/opencl/wolf-aes.cl" + ; + + const char* source_code_template = +#include "amd_gpu/opencl/cryptonight_r.rtcl" + ; + const char include_name[] = "XMRSTAK_INCLUDE_RANDOM_MATH"; + const char* offset = strstr(source_code_template, include_name); + if(!offset) + { + printer::inst()->print_msg(LDEBUG, "CryptonightR_get_program: XMRSTAK_INCLUDE_RANDOM_MATH not found in cryptonight_r.cl", algo); + return nullptr; + } + + std::string source_code(source_code_definitions); + + for(uint64_t c = 0; c < height_chunk_size; ++c) + { + V4_Instruction code[256]; + int code_size; + switch(algo.Id()) + { + case cryptonight_r_wow: + code_size = v4_random_math_init(code, height_offset + c); + break; + case cryptonight_r: + code_size = v4_random_math_init(code, height_offset + c); + break; + default: + printer::inst()->print_msg(L0, "CryptonightR_get_program: invalid algo %d", algo); + return nullptr; + } + + std::string kernel_code(source_code_template, offset); + kernel_code.append(get_code(code, code_size)); + kernel_code.append(offset + sizeof(include_name) - 1); + + std::string kernel_name = "cn1_cryptonight_r_" + std::to_string(height_offset + c); + + source_code += std::regex_replace(kernel_code, std::regex("cn1_cryptonight_r"), kernel_name); + } // scratchpad size for the selected mining algorithm size_t hashMemSize = algo.Mem(); @@ -329,28 +372,12 @@ cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t if(algo == cryptonight_gpu) options += " -cl-fp32-correctly-rounded-divide-sqrt"; + program = search_program(ctx, algo, height_offset); - const char* source = source_code.c_str(); - - { - CryptonightR_cache_mutex.ReadLock(); - - // Check if the cache has this program - for (const CacheEntry& entry : CryptonightR_cache) - { - if ((entry.algo == algo) && (entry.height == height) && (entry.deviceIdx == ctx->deviceIdx)) - { - printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu found in cache", height); - auto result = entry.program; - CryptonightR_cache_mutex.UnLock(); - return result; - } - } - CryptonightR_cache_mutex.UnLock(); - - } + if(program != nullptr) + return program; - return CryptonightR_build_program(ctx, algo, height, precompile_count, old_kernel, source, options); + return CryptonightR_build_program(ctx, algo, height_offset, precompile_count, height_chunk_size, source_code, options); } } // namespace amd diff --git a/xmrstak/backend/amd/OclCryptonightR_gen.hpp b/xmrstak/backend/amd/OclCryptonightR_gen.hpp index 5f97d1e51..f8772b1f5 100644 --- a/xmrstak/backend/amd/OclCryptonightR_gen.hpp +++ b/xmrstak/backend/amd/OclCryptonightR_gen.hpp @@ -3,8 +3,8 @@ #include "xmrstak/backend/cryptonight.hpp" #include -#include #include +#include #if defined(__APPLE__) #include @@ -20,7 +20,7 @@ namespace amd { cl_program CryptonightR_get_program(GpuContext* ctx, const xmrstak_algo algo, - uint64_t height, uint32_t precompile_count, bool background = false, cl_kernel old_kernel = nullptr); + uint64_t height_offset, uint64_t height_chunk_size, uint32_t precompile_count, bool background = false); } // namespace amd } // namespace xmrstak diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 9c9db2ee3..77857612e 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -13,58 +13,43 @@ * along with this program. If not, see . */ +#include "xmrstak/backend/amd/OclCryptonightR_gen.hpp" #include "xmrstak/backend/cryptonight.hpp" #include "xmrstak/jconf.hpp" -#include "xmrstak/picosha2/picosha2.hpp" +#include "xmrstak/net/msgstruct.hpp" #include "xmrstak/params.hpp" +#include "xmrstak/picosha2/picosha2.hpp" #include "xmrstak/version.hpp" -#include "xmrstak/net/msgstruct.hpp" -#include "xmrstak/backend/amd/OclCryptonightR_gen.hpp" +#include +#include +#include +#include +#include #include #include -#include -#include #include -#include -#include -#include -#include #include +#include #include -#include #include -#include #include +#include #if defined _MSC_VER #include #elif defined __GNUC__ -#include #include +#include #endif - #ifdef _WIN32 #include -#include static inline void create_directory(std::string dirname) { - _mkdir(dirname.data()); -} - -static inline std::string get_home() -{ - char path[MAX_PATH + 1]; - // get folder "appdata\local" - if (SHGetSpecialFolderPathA(HWND_DESKTOP, path, CSIDL_LOCAL_APPDATA, FALSE)) - { - return path; - } - else - return "."; + _mkdir(dirname.data()); } static inline void port_sleep(size_t sec) @@ -72,24 +57,14 @@ static inline void port_sleep(size_t sec) Sleep(sec * 1000); } #else -#include #include +#include static inline void create_directory(std::string dirname) { mkdir(dirname.data(), 0744); } -static inline std::string get_home() -{ - const char *home = "."; - - if ((home = getenv("HOME")) == nullptr) - home = getpwuid(getuid())->pw_dir; - - return home; -} - static inline void port_sleep(size_t sec) { sleep(sec); @@ -123,7 +98,7 @@ char* LoadTextFile(const char* filename) flen = ftell(kernel); fseek(kernel, 0, SEEK_SET); - out = (char*)malloc(flen+1); + out = (char*)malloc(flen + 1); size_t r = fread(out, flen, 1, kernel); fclose(kernel); @@ -144,7 +119,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &MaximumWorkSize, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when querying a device's max worksize using clGetDeviceInfo.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when querying a device's max worksize using clGetDeviceInfo.", err_to_str(ret)); return ERR_OCL_API; } @@ -163,16 +138,16 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ */ MaximumWorkSize /= 8; } - printer::inst()->print_msg(L1,"Device %lu work size %lu / %lu.", ctx->deviceIdx, ctx->workSize, MaximumWorkSize); + printer::inst()->print_msg(L1, "Device %lu work size %lu / %lu.", ctx->deviceIdx, ctx->workSize, MaximumWorkSize); if(ctx->workSize > MaximumWorkSize) { ctx->workSize = MaximumWorkSize; - printer::inst()->print_msg(L1,"Device %lu work size to large, reduce to %lu / %lu.", ctx->deviceIdx, ctx->workSize, MaximumWorkSize); + printer::inst()->print_msg(L1, "Device %lu work size to large, reduce to %lu / %lu.", ctx->deviceIdx, ctx->workSize, MaximumWorkSize); } const std::string backendName = xmrstak::params::inst().openCLVendor; - if( (ctx->stridedIndex == 2 || ctx->stridedIndex == 3) && (ctx->rawIntensity % ctx->workSize) != 0) + if((ctx->stridedIndex == 2 || ctx->stridedIndex == 3) && (ctx->rawIntensity % ctx->workSize) != 0) { size_t reduced_intensity = (ctx->rawIntensity / ctx->workSize) * ctx->workSize; ctx->rawIntensity = reduced_intensity; @@ -180,29 +155,29 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ } #if defined(CL_VERSION_2_0) && !defined(CONF_ENFORCE_OpenCL_1_2) - const cl_queue_properties CommandQueueProperties[] = { 0, 0, 0 }; + const cl_queue_properties CommandQueueProperties[] = {0, 0, 0}; ctx->CommandQueues = clCreateCommandQueueWithProperties(opencl_ctx, ctx->DeviceID, CommandQueueProperties, &ret); #else - const cl_command_queue_properties CommandQueueProperties = { 0 }; + const cl_command_queue_properties CommandQueueProperties = {0}; ctx->CommandQueues = clCreateCommandQueue(opencl_ctx, ctx->DeviceID, CommandQueueProperties, &ret); #endif if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clCreateCommandQueueWithProperties.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clCreateCommandQueueWithProperties.", err_to_str(ret)); return ERR_OCL_API; } if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx->computeUnits), NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_COMPUTE_UNITS for device %u.", err_to_str(ret), (uint32_t)ctx->deviceIdx); + printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_COMPUTE_UNITS for device %u.", err_to_str(ret), (uint32_t)ctx->deviceIdx); return ERR_OCL_API; } ctx->InputBuffer = clCreateBuffer(opencl_ctx, CL_MEM_READ_ONLY, 128, NULL, &ret); if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create input buffer.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create input buffer.", err_to_str(ret)); return ERR_OCL_API; } @@ -216,14 +191,14 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ ctx->ExtraBuffers[0] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, scratchPadSize * g_thd, NULL, &ret); if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create hash scratchpads buffer.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create hash scratchpads buffer.", err_to_str(ret)); return ERR_OCL_API; } ctx->ExtraBuffers[1] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, 200 * g_thd, NULL, &ret); if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create hash states buffer.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create hash states buffer.", err_to_str(ret)); return ERR_OCL_API; } @@ -231,7 +206,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ ctx->ExtraBuffers[2] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), NULL, &ret); if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create Branch 0 buffer.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create Branch 0 buffer.", err_to_str(ret)); return ERR_OCL_API; } @@ -239,7 +214,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ ctx->ExtraBuffers[3] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), NULL, &ret); if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create Branch 1 buffer.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create Branch 1 buffer.", err_to_str(ret)); return ERR_OCL_API; } @@ -247,7 +222,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ ctx->ExtraBuffers[4] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), NULL, &ret); if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create Branch 2 buffer.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create Branch 2 buffer.", err_to_str(ret)); return ERR_OCL_API; } @@ -255,7 +230,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ ctx->ExtraBuffers[5] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), NULL, &ret); if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create Branch 3 buffer.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create Branch 3 buffer.", err_to_str(ret)); return ERR_OCL_API; } @@ -263,21 +238,21 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ ctx->OutputBuffer = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * 0x100, NULL, &ret); if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create output buffer.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create output buffer.", err_to_str(ret)); return ERR_OCL_API; } std::vector devNameVec(1024); if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_NAME for device %u.", err_to_str(ret),ctx->deviceIdx ); + printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_NAME for device %u.", err_to_str(ret), ctx->deviceIdx); return ERR_OCL_API; } std::vector openCLDriverVer(1024); if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DRIVER_VERSION, openCLDriverVer.size(), openCLDriverVer.data(), NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(ret),ctx->deviceIdx ); + printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(ret), ctx->deviceIdx); return ERR_OCL_API; } @@ -339,7 +314,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ isWindowsOs = 1; #endif options += " -DIS_WINDOWS_OS=" + std::to_string(isWindowsOs); - + if(miner_algo == cryptonight_gpu) options += " -cl-fp32-correctly-rounded-divide-sqrt"; @@ -358,16 +333,18 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ std::string hash_hex_str; picosha2::hash256_hex_string(src_str, hash_hex_str); - std::string cache_file = get_home() + "/.openclcache/" + hash_hex_str + ".openclbin"; + const std::string cache_dir = xmrstak::params::inst().rootAMDCacheDir; + + std::string cache_file = cache_dir + hash_hex_str + ".openclbin"; std::ifstream clBinFile(cache_file, std::ofstream::in | std::ofstream::binary); if(xmrstak::params::inst().AMDCache == false || !clBinFile.good()) { if(xmrstak::params::inst().AMDCache) - printer::inst()->print_msg(L1,"OpenCL device %u - Precompiled code %s not found. Compiling ...",ctx->deviceIdx, cache_file.c_str()); + printer::inst()->print_msg(L1, "OpenCL device %u - Precompiled code %s not found. Compiling ...", ctx->deviceIdx, cache_file.c_str()); ctx->Program[miner_algo] = clCreateProgramWithSource(opencl_ctx, 1, (const char**)&source_code, NULL, &ret); if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret)); return ERR_OCL_API; } @@ -375,11 +352,11 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ if(ret != CL_SUCCESS) { size_t len; - printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clBuildProgram.", err_to_str(ret)); if((ret = clGetProgramBuildInfo(ctx->Program[miner_algo], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret)); return ERR_OCL_API; } @@ -389,28 +366,27 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ if((ret = clGetProgramBuildInfo(ctx->Program[miner_algo], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS) { free(BuildLog); - printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret)); return ERR_OCL_API; } printer::inst()->print_str("Build log:\n"); - std::cerr<Program[miner_algo], CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices,NULL); - + clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices, NULL); std::vector devices_ids(num_devices); - clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_DEVICES, sizeof(cl_device_id)* devices_ids.size(), devices_ids.data(),NULL); + clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_DEVICES, sizeof(cl_device_id) * devices_ids.size(), devices_ids.data(), NULL); int dev_id = 0; /* Search for the gpu within the program context. * The id can be different to ctx->DeviceID. */ - for(auto & ocl_device : devices_ids) + for(auto& ocl_device : devices_ids) { if(ocl_device == ctx->DeviceID) break; @@ -422,17 +398,16 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ { if((ret = clGetProgramBuildInfo(ctx->Program[miner_algo], ctx->DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret)); return ERR_OCL_API; } port_sleep(1); - } - while(status == CL_BUILD_IN_PROGRESS); + } while(status == CL_BUILD_IN_PROGRESS); if(xmrstak::params::inst().AMDCache) { std::vector binary_sizes(num_devices); - clGetProgramInfo (ctx->Program[miner_algo], CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * binary_sizes.size(), binary_sizes.data(), NULL); + clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * binary_sizes.size(), binary_sizes.data(), NULL); std::vector all_programs(num_devices); std::vector> program_storage; @@ -440,7 +415,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ int p_id = 0; size_t mem_size = 0; // create memory structure to query all OpenCL program binaries - for(auto & p : all_programs) + for(auto& p : all_programs) { program_storage.emplace_back(std::vector(binary_sizes[p_id])); all_programs[p_id] = program_storage[p_id].data(); @@ -448,9 +423,9 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ p_id++; } - if((ret = clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_BINARIES, num_devices * sizeof(char*), all_programs.data(),NULL)) != CL_SUCCESS) + if((ret = clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_BINARIES, num_devices * sizeof(char*), all_programs.data(), NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clGetProgramInfo.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clGetProgramInfo.", err_to_str(ret)); return ERR_OCL_API; } @@ -458,12 +433,12 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ file_stream.open(cache_file, std::ofstream::out | std::ofstream::binary); file_stream.write(all_programs[dev_id], binary_sizes[dev_id]); file_stream.close(); - printer::inst()->print_msg(L1, "OpenCL device %u - Precompiled code stored in file %s",ctx->deviceIdx, cache_file.c_str()); + printer::inst()->print_msg(L1, "OpenCL device %u - Precompiled code stored in file %s", ctx->deviceIdx, cache_file.c_str()); } } else { - printer::inst()->print_msg(L1, "OpenCL device %u - Load precompiled code from file %s",ctx->deviceIdx, cache_file.c_str()); + printer::inst()->print_msg(L1, "OpenCL device %u - Load precompiled code from file %s", ctx->deviceIdx, cache_file.c_str()); std::ostringstream ss; ss << clBinFile.rdbuf(); std::string s = ss.str(); @@ -474,22 +449,21 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ cl_int clStatus; ctx->Program[miner_algo] = clCreateProgramWithBinary( opencl_ctx, 1, &ctx->DeviceID, &bin_size, - (const unsigned char **)&data_ptr, &clStatus, &ret - ); + (const unsigned char**)&data_ptr, &clStatus, &ret); if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithBinary. Try to delete file %s", err_to_str(ret), cache_file.c_str()); + printer::inst()->print_msg(L1, "Error %s when calling clCreateProgramWithBinary. Try to delete file %s", err_to_str(ret), cache_file.c_str()); return ERR_OCL_API; } ret = clBuildProgram(ctx->Program[miner_algo], 1, &ctx->DeviceID, NULL, NULL, NULL); if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram. Try to delete file %s", err_to_str(ret), cache_file.c_str()); + printer::inst()->print_msg(L1, "Error %s when calling clBuildProgram. Try to delete file %s", err_to_str(ret), cache_file.c_str()); return ERR_OCL_API; } } - std::vector KernelNames = { "cn2", "Blake", "Groestl", "JH", "Skein" }; + std::vector KernelNames = {"cn2", "Blake", "Groestl", "JH", "Skein"}; if(miner_algo == cryptonight_gpu) { KernelNames.insert(KernelNames.begin(), "cn1_cn_gpu"); @@ -515,7 +489,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ ctx->Kernels[miner_algo][i] = clCreateKernel(ctx->Program[miner_algo], KernelNames[i].c_str(), &ret); if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel_0 %s.", err_to_str(ret), KernelNames[i].c_str()); + printer::inst()->print_msg(L1, "Error %s when calling clCreateKernel for kernel_0 %s.", err_to_str(ret), KernelNames[i].c_str()); return ERR_OCL_API; } } @@ -529,30 +503,28 @@ const cl_platform_info attributeTypes[5] = { CL_PLATFORM_VENDOR, CL_PLATFORM_VERSION, CL_PLATFORM_PROFILE, - CL_PLATFORM_EXTENSIONS -}; + CL_PLATFORM_EXTENSIONS}; const char* const attributeNames[] = { "CL_PLATFORM_NAME", "CL_PLATFORM_VENDOR", "CL_PLATFORM_VERSION", "CL_PLATFORM_PROFILE", - "CL_PLATFORM_EXTENSIONS" -}; + "CL_PLATFORM_EXTENSIONS"}; -#define NELEMS(x) (sizeof(x) / sizeof((x)[0])) +#define NELEMS(x) (sizeof(x) / sizeof((x)[0])) uint32_t getNumPlatforms() { cl_uint num_platforms = 0; - cl_platform_id * platforms = NULL; + cl_platform_id* platforms = NULL; cl_int clStatus; // Get platform and device information clStatus = clGetPlatformIDs(0, NULL, &num_platforms); if(clStatus != CL_SUCCESS) { - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetPlatformIDs for number of platforms.", err_to_str(clStatus)); + printer::inst()->print_msg(L1, "WARNING: %s when calling clGetPlatformIDs for number of platforms.", err_to_str(clStatus)); return 0u; } @@ -575,29 +547,29 @@ std::vector getAMDDevices(int index) platforms.resize(numPlatforms); if((clStatus = clGetPlatformIDs(numPlatforms, platforms.data(), NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetPlatformIDs for platform information.", err_to_str(clStatus)); + printer::inst()->print_msg(L1, "WARNING: %s when calling clGetPlatformIDs for platform information.", err_to_str(clStatus)); return ctxVec; } - if((clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices)) != CL_SUCCESS) + if((clStatus = clGetDeviceIDs(platforms[index], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for of devices.", err_to_str(clStatus)); + printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceIDs for of devices.", err_to_str(clStatus)); return ctxVec; } device_list.resize(num_devices); - if((clStatus = clGetDeviceIDs( platforms[index], CL_DEVICE_TYPE_GPU, num_devices, device_list.data(), NULL)) != CL_SUCCESS) + if((clStatus = clGetDeviceIDs(platforms[index], CL_DEVICE_TYPE_GPU, num_devices, device_list.data(), NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceIDs for device information.", err_to_str(clStatus)); + printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceIDs for device information.", err_to_str(clStatus)); return ctxVec; } - for (size_t k = 0; k < num_devices; k++) + for(size_t k = 0; k < num_devices; k++) { std::vector devVendorVec(1024); if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_VENDOR, devVendorVec.size(), devVendorVec.data(), NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get the device vendor name for device %u.", err_to_str(clStatus), k); + printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get the device vendor name for device %u.", err_to_str(clStatus), k); continue; } @@ -617,19 +589,19 @@ std::vector getAMDDevices(int index) if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx.computeUnits), NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_COMPUTE_UNITS for device %u.", err_to_str(clStatus), k); + printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_COMPUTE_UNITS for device %u.", err_to_str(clStatus), k); continue; } if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &(ctx.maxMemPerAlloc), NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_MEM_ALLOC_SIZE for device %u.", err_to_str(clStatus), k); + printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_MEM_ALLOC_SIZE for device %u.", err_to_str(clStatus), k); continue; } if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &(ctx.freeMem), NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_GLOBAL_MEM_SIZE for device %u.", err_to_str(clStatus), k); + printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_GLOBAL_MEM_SIZE for device %u.", err_to_str(clStatus), k); continue; } @@ -639,14 +611,14 @@ std::vector getAMDDevices(int index) if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_NAME for device %u.", err_to_str(clStatus), k); + printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_NAME for device %u.", err_to_str(clStatus), k); continue; } std::vector openCLDriverVer(1024); if((clStatus = clGetDeviceInfo(device_list[k], CL_DRIVER_VERSION, openCLDriverVer.size(), openCLDriverVer.data(), NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(clStatus), k); + printer::inst()->print_msg(L1, "WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(clStatus), k); continue; } @@ -657,7 +629,7 @@ std::vector getAMDDevices(int index) ctx.name = std::string(devNameVec.data()); ctx.DeviceID = device_list[k]; ctx.interleave = 40; - printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str()); + printer::inst()->print_msg(L0, "Found OpenCL GPU %s.", ctx.name.c_str()); ctxVec.push_back(ctx); } } @@ -672,13 +644,13 @@ int getAMDPlatformIdx() if(numPlatforms == 0) { - printer::inst()->print_msg(L0,"WARNING: No OpenCL platform found."); + printer::inst()->print_msg(L0, "WARNING: No OpenCL platform found."); return -1; } - cl_platform_id * platforms = NULL; + cl_platform_id* platforms = NULL; cl_int clStatus; - platforms = (cl_platform_id *) malloc(sizeof(cl_platform_id) * numPlatforms); + platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * numPlatforms); clStatus = clGetPlatformIDs(numPlatforms, platforms, NULL); int platformIndex = -1; @@ -687,7 +659,8 @@ int getAMDPlatformIdx() if(clStatus == CL_SUCCESS) { - for (int i = 0; i < numPlatforms; i++) { + for(int i = 0; i < numPlatforms; i++) + { size_t infoSize; clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 0, NULL, &infoSize); std::vector platformNameVec(infoSize); @@ -696,13 +669,13 @@ int getAMDPlatformIdx() std::string platformName(platformNameVec.data()); bool isAMDOpenCL = platformName.find("Advanced Micro Devices") != std::string::npos || - platformName.find("Apple") != std::string::npos || - platformName.find("Mesa") != std::string::npos; + platformName.find("Apple") != std::string::npos || + platformName.find("Mesa") != std::string::npos; bool isNVIDIADevice = platformName.find("NVIDIA Corporation") != std::string::npos || platformName.find("NVIDIA") != std::string::npos; std::string selectedOpenCLVendor = xmrstak::params::inst().openCLVendor; if((isAMDOpenCL && selectedOpenCLVendor == "AMD") || (isNVIDIADevice && selectedOpenCLVendor == "NVIDIA")) { - printer::inst()->print_msg(L0,"Found %s platform index id = %i, name = %s", selectedOpenCLVendor.c_str(), i , platformName.c_str()); + printer::inst()->print_msg(L0, "Found %s platform index id = %i, name = %s", selectedOpenCLVendor.c_str(), i, platformName.c_str()); if(platformName.find("Mesa") != std::string::npos) mesaPlatform = i; else @@ -716,12 +689,12 @@ int getAMDPlatformIdx() // fall back to Mesa OpenCL if(platformIndex == -1 && mesaPlatform != -1) { - printer::inst()->print_msg(L0,"No AMD platform found select Mesa as OpenCL platform"); + printer::inst()->print_msg(L0, "No AMD platform found select Mesa as OpenCL platform"); platformIndex = mesaPlatform; } } else - printer::inst()->print_msg(L1,"WARNING: %s when calling clGetPlatformIDs for platform information.", err_to_str(clStatus)); + printer::inst()->print_msg(L1, "WARNING: %s when calling clGetPlatformIDs for platform information.", err_to_str(clStatus)); free(platforms); return platformIndex; @@ -737,15 +710,14 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) if((ret = clGetPlatformIDs(0, NULL, &entries)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clGetPlatformIDs for number of platforms.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clGetPlatformIDs for number of platforms.", err_to_str(ret)); return ERR_OCL_API; } - // The number of platforms naturally is the index of the last platform plus one. if(entries <= platform_idx) { - printer::inst()->print_msg(L1,"Selected OpenCL platform index %d doesn't exist.", platform_idx); + printer::inst()->print_msg(L1, "Selected OpenCL platform index %d doesn't exist.", platform_idx); return ERR_STUPID_PARAMS; } @@ -757,7 +729,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) #endif if((ret = clGetPlatformIDs(entries, PlatformIDList, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clGetPlatformIDs for platform ID information.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clGetPlatformIDs for platform ID information.", err_to_str(ret)); return ERR_OCL_API; } @@ -768,12 +740,12 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) std::string platformName(platformNameVec.data()); if(xmrstak::params::inst().openCLVendor == "AMD" && platformName.find("Advanced Micro Devices") == std::string::npos) { - printer::inst()->print_msg(L1,"WARNING: using non AMD device: %s", platformName.c_str()); + printer::inst()->print_msg(L1, "WARNING: using non AMD device: %s", platformName.c_str()); } if((ret = clGetDeviceIDs(PlatformIDList[platform_idx], CL_DEVICE_TYPE_GPU, 0, NULL, &entries)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clGetDeviceIDs for number of devices.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clGetDeviceIDs for number of devices.", err_to_str(ret)); return ERR_OCL_API; } @@ -782,7 +754,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) { if(ctx[i].deviceIdx >= entries) { - printer::inst()->print_msg(L1,"Selected OpenCL device index %lu doesn't exist.\n", ctx[i].deviceIdx); + printer::inst()->print_msg(L1, "Selected OpenCL device index %lu doesn't exist.\n", ctx[i].deviceIdx); return ERR_STUPID_PARAMS; } } @@ -794,7 +766,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) #endif if((ret = clGetDeviceIDs(PlatformIDList[platform_idx], CL_DEVICE_TYPE_GPU, entries, DeviceIDList, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clGetDeviceIDs for device ID information.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clGetDeviceIDs for device ID information.", err_to_str(ret)); return ERR_OCL_API; } @@ -811,41 +783,41 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) cl_context opencl_ctx = clCreateContext(NULL, num_gpus, TempDeviceList.data(), NULL, NULL, &ret); if(ret != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clCreateContext.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clCreateContext.", err_to_str(ret)); return ERR_OCL_API; } - const char *fastIntMathV2CL = - #include "./opencl/fast_int_math_v2.cl" - ; - const char *fastDivHeavyCL = - #include "./opencl/fast_div_heavy.cl" - ; - const char *cryptonightCL = - #include "./opencl/cryptonight.cl" - ; - const char *blake256CL = - #include "./opencl/blake256.cl" - ; - const char *groestl256CL = - #include "./opencl/groestl256.cl" - ; - const char *jhCL = - #include "./opencl/jh.cl" - ; - const char *wolfAesCL = - #include "./opencl/wolf-aes.cl" - ; - const char *wolfSkeinCL = - #include "./opencl/wolf-skein.cl" - ; - const char *cryptonight_gpu = - #include "./opencl/cryptonight_gpu.cl" - ; + const char* fastIntMathV2CL = +#include "./opencl/fast_int_math_v2.cl" + ; + const char* fastDivHeavyCL = +#include "./opencl/fast_div_heavy.cl" + ; + const char* cryptonightCL = +#include "./opencl/cryptonight.cl" + ; + const char* blake256CL = +#include "./opencl/blake256.cl" + ; + const char* groestl256CL = +#include "./opencl/groestl256.cl" + ; + const char* jhCL = +#include "./opencl/jh.cl" + ; + const char* wolfAesCL = +#include "./opencl/wolf-aes.cl" + ; + const char* wolfSkeinCL = +#include "./opencl/wolf-skein.cl" + ; + const char* cryptonight_gpu = +#include "./opencl/cryptonight_gpu.cl" + ; std::string source_code(cryptonightCL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL); - source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_DIV_HEAVY"), fastDivHeavyCL); + source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_DIV_HEAVY"), fastDivHeavyCL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_AES"), wolfAesCL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_SKEIN"), wolfSkeinCL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_JH"), jhCL); @@ -854,13 +826,14 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_CN_GPU"), cryptonight_gpu); // create a directory for the OpenCL compile cache - create_directory(get_home() + "/.openclcache"); + const std::string cache_dir = xmrstak::params::inst().rootAMDCacheDir; + create_directory(cache_dir); std::vector> interleaveData(num_gpus, nullptr); for(int i = 0; i < num_gpus; ++i) { - printer::inst()->print_msg(LDEBUG,"OpenCL Init device %d", ctx[i].deviceIdx); + printer::inst()->print_msg(LDEBUG, "OpenCL Init device %d", ctx[i].deviceIdx); const size_t devIdx = ctx[i].deviceIdx; if(interleaveData.size() <= devIdx) { @@ -870,12 +843,11 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) { interleaveData[devIdx].reset(new InterleaveData{}); interleaveData[devIdx]->lastRunTimeStamp = get_timestamp_ms(); - } - ctx[i].idWorkerOnDevice=interleaveData[devIdx]->numThreadsOnGPU; + ctx[i].idWorkerOnDevice = interleaveData[devIdx]->numThreadsOnGPU; ++interleaveData[devIdx]->numThreadsOnGPU; ctx[i].interleaveData = interleaveData[devIdx]; - ctx[i].interleaveData->adjustThreshold = static_cast(ctx[i].interleave)/100.0; + ctx[i].interleaveData->adjustThreshold = static_cast(ctx[i].interleave) / 100.0; ctx[i].interleaveData->startAdjustThreshold = ctx[i].interleaveData->adjustThreshold; ctx[i].opencl_ctx = opencl_ctx; @@ -891,7 +863,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, const xmrstak_algo& miner_algo, uint64_t height) { - auto & Kernels = ctx->Kernels[miner_algo.Id()]; + auto& Kernels = ctx->Kernels[miner_algo.Id()]; cl_int ret; @@ -905,35 +877,35 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->InputBuffer, CL_TRUE, 0, 128, input, 0, NULL, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueWriteBuffer to fill input buffer.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clEnqueueWriteBuffer to fill input buffer.", err_to_str(ret)); return ERR_OCL_API; } if((ret = clSetKernelArg(Kernels[0], 0, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 0.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 0, argument 0.", err_to_str(ret)); return ERR_OCL_API; } // Scratchpads if((ret = clSetKernelArg(Kernels[0], 1, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret)); return ERR_OCL_API; } // States if((ret = clSetKernelArg(Kernels[0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret)); return ERR_OCL_API; } // Threads if((ret = clSetKernelArg(Kernels[0], 3, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 3.", err_to_str(ret)); - return(ERR_OCL_API); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 0, argument 3.", err_to_str(ret)); + return (ERR_OCL_API); } if(miner_algo == cryptonight_gpu) @@ -942,79 +914,88 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar // Scratchpads if((ret = clSetKernelArg(Kernels[7], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret)); return ERR_OCL_API; } // States if((ret = clSetKernelArg(Kernels[7], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret)); return ERR_OCL_API; } } - // CN1 Kernel + // CN1 Kernel - if ((miner_algo == cryptonight_r) || (miner_algo == cryptonight_r_wow)) { + if((miner_algo == cryptonight_r) || (miner_algo == cryptonight_r_wow)) + { - uint32_t PRECOMPILATION_DEPTH = 4; + uint32_t PRECOMPILATION_DEPTH = 1; + constexpr uint64_t height_chunk_size = 25; + uint64_t height_offset = (height / height_chunk_size) * height_chunk_size; - // Get new kernel - cl_program program = xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height, PRECOMPILATION_DEPTH); + // Get new kernel + cl_program program = xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height_offset, height_chunk_size, PRECOMPILATION_DEPTH); - if (program != ctx->ProgramCryptonightR) { - cl_int ret; - cl_kernel kernel = clCreateKernel(program, "cn1_cryptonight_r", &ret); + if(program != ctx->ProgramCryptonightR || ctx->last_block_height != height) + { + cl_int ret; + std::string kernel_name = "cn1_cryptonight_r_" + std::to_string(height); + cl_kernel kernel = clCreateKernel(program, kernel_name.c_str(), &ret); - cl_kernel old_kernel = nullptr; - if (ret != CL_SUCCESS) { - printer::inst()->print_msg(LDEBUG, "CryptonightR: clCreateKernel returned error %s", err_to_str(ret)); - } - else { - old_kernel = Kernels[1]; - Kernels[1] = kernel; - } - ctx->ProgramCryptonightR = program; + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(LDEBUG, "CryptonightR: clCreateKernel returned error %s", err_to_str(ret)); + } + else + { + cl_kernel old_kernel = Kernels[1]; + if(old_kernel) + clReleaseKernel(old_kernel); + Kernels[1] = kernel; + } + ctx->ProgramCryptonightR = program; + ctx->last_block_height = height; + printer::inst()->print_msg(LDEBUG, "Set height %llu", height); - // Precompile next program in background - xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height + 1, PRECOMPILATION_DEPTH, true, old_kernel); - for (int i = 2; i <= PRECOMPILATION_DEPTH; ++i) - xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height + i, PRECOMPILATION_DEPTH, true, nullptr); + // Precompile next program in background + for(int i = 1; i <= PRECOMPILATION_DEPTH; ++i) + xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height_offset + i * height_chunk_size, height_chunk_size, PRECOMPILATION_DEPTH, true); - printer::inst()->print_msg(LDEBUG, "Thread #%zu updated CryptonightR", ctx->deviceIdx); - } + printer::inst()->print_msg(LDEBUG, "Thread #%zu updated CryptonightR", ctx->deviceIdx); + } else { printer::inst()->print_msg(LDEBUG, "Thread #%zu found CryptonightR", ctx->deviceIdx); } - } + } // Scratchpads if((ret = clSetKernelArg(Kernels[1], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 0.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, argument 0.", err_to_str(ret)); return ERR_OCL_API; } // States if((ret = clSetKernelArg(Kernels[1], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 1.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, argument 1.", err_to_str(ret)); return ERR_OCL_API; } // Threads if((ret = clSetKernelArg(Kernels[1], 2, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 2.", err_to_str(ret)); - return(ERR_OCL_API); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, argument 2.", err_to_str(ret)); + return (ERR_OCL_API); } if(miner_algo == cryptonight_monero || miner_algo == cryptonight_aeon || miner_algo == cryptonight_ipbc || miner_algo == cryptonight_stellite || miner_algo == cryptonight_masari || miner_algo == cryptonight_bittube2) { // Input - if ((ret = clSetKernelArg(Kernels[1], 3, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[1], 3, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) { printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, argument 4(input buffer).", err_to_str(ret)); return ERR_OCL_API; @@ -1025,14 +1006,14 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar // Scratchpads if((ret = clSetKernelArg(Kernels[2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 0.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 0.", err_to_str(ret)); return ERR_OCL_API; } // States if((ret = clSetKernelArg(Kernels[2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 1.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 1.", err_to_str(ret)); return ERR_OCL_API; } @@ -1041,59 +1022,59 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar // Output if((ret = clSetKernelArg(Kernels[2], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), 2, 2); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), 2, 2); return ERR_OCL_API; } // Target if((ret = clSetKernelArg(Kernels[2], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), 2, 3); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), 2, 3); return ERR_OCL_API; } // Threads if((ret = clSetKernelArg(Kernels[2], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret)); - return(ERR_OCL_API); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret)); + return (ERR_OCL_API); } } else - { + { // Branch 0 if((ret = clSetKernelArg(Kernels[2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret)); return ERR_OCL_API; } // Branch 1 if((ret = clSetKernelArg(Kernels[2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret)); return ERR_OCL_API; } // Branch 2 if((ret = clSetKernelArg(Kernels[2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret)); return ERR_OCL_API; } // Branch 3 if((ret = clSetKernelArg(Kernels[2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret)); return ERR_OCL_API; } // Threads if((ret = clSetKernelArg(Kernels[2], 6, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret)); - return(ERR_OCL_API); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret)); + return (ERR_OCL_API); } for(int i = 0; i < 4; ++i) @@ -1101,35 +1082,35 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar // States if((ret = clSetKernelArg(Kernels[i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 0); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 0); return ERR_OCL_API; } // Nonce buffer if((ret = clSetKernelArg(Kernels[i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 1); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 1); return ERR_OCL_API; } // Output if((ret = clSetKernelArg(Kernels[i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 2); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 2); return ERR_OCL_API; } // Target if((ret = clSetKernelArg(Kernels[i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3); return ERR_OCL_API; } if((clSetKernelArg(Kernels[i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4); - return(ERR_OCL_API); + printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4); + return (ERR_OCL_API); } } } @@ -1153,7 +1134,7 @@ uint64_t updateTimings(GpuContext* ctx, const uint64_t t) if(ctx->interleaveData->avgKernelRuntime == 0.0 || ctx->interleaveData->avgKernelRuntime > 20000.0) ctx->interleaveData->avgKernelRuntime = runtime; else - ctx->interleaveData->avgKernelRuntime = ctx->interleaveData->avgKernelRuntime * (1.0 - averagingBias) + (runtime) * averagingBias; + ctx->interleaveData->avgKernelRuntime = ctx->interleaveData->avgKernelRuntime * (1.0 - averagingBias) + (runtime)*averagingBias; } return runtime; } @@ -1182,7 +1163,7 @@ uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment) if((dt > 0) && (dt < optimalTimeOffset)) { - delay = static_cast((optimalTimeOffset - dt)); + delay = static_cast((optimalTimeOffset - dt)); if(enableAutoAdjustment) { @@ -1201,8 +1182,7 @@ uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment) // avoid that the auto adjustment is disable interleaving ctx->interleaveData->adjustThreshold = std::max( ctx->interleaveData->adjustThreshold, - 0.001 - ); + 0.001); } delay = std::max(int64_t(0), delay); @@ -1213,13 +1193,12 @@ uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment) { // do not notify the user anymore if we reach a good delay if(delay > maxDelay) - printer::inst()->print_msg(L1,"OpenCL Interleave %u|%u: %u/%.2lf ms - %.1lf", + printer::inst()->print_msg(L1, "OpenCL Interleave %u|%u: %u/%.2lf ms - %.1lf", ctx->deviceIdx, ctx->idWorkerOnDevice, static_cast(delay), avgRuntime, - ctx->interleaveData->adjustThreshold * 100. - ); + ctx->interleaveData->adjustThreshold * 100.); std::this_thread::sleep_for(std::chrono::milliseconds(delay)); } @@ -1230,12 +1209,12 @@ uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment) size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner_algo) { - const auto & Kernels = ctx->Kernels[miner_algo.Id()]; + const auto& Kernels = ctx->Kernels[miner_algo.Id()]; cl_int ret; cl_uint zero = 0; size_t BranchNonces[4]; - memset(BranchNonces,0,sizeof(size_t)*4); + memset(BranchNonces, 0, sizeof(size_t) * 4); size_t g_intensity = ctx->rawIntensity; size_t w_size = ctx->workSize; @@ -1246,28 +1225,28 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner // round up to next multiple of w_size g_thd = ((g_intensity + w_size - 1u) / w_size) * w_size; // number of global threads must be a multiple of the work group size (w_size) - assert(g_thd%w_size == 0); + assert(g_thd % w_size == 0); } for(int i = 2; i < 6; ++i) { if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->ExtraBuffers[i], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), &zero, 0, NULL, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueWriteBuffer to zero branch buffer counter %d.", err_to_str(ret), i - 2); + printer::inst()->print_msg(L1, "Error %s when calling clEnqueueWriteBuffer to zero branch buffer counter %d.", err_to_str(ret), i - 2); return ERR_OCL_API; } } if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->OutputBuffer, CL_FALSE, sizeof(cl_uint) * 0xFF, sizeof(cl_uint), &zero, 0, NULL, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueWriteBuffer to fetch results.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clEnqueueWriteBuffer to fetch results.", err_to_str(ret)); return ERR_OCL_API; } - size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = { g_thd, 8 }, lthreads[2] = { 8, 8 }; + size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = {g_thd, 8}, lthreads[2] = {8, 8}; if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 0); + printer::inst()->print_msg(L1, "Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 0); return ERR_OCL_API; } @@ -1279,7 +1258,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner size_t intens = g_intensity * thd; if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[7], 1, 0, &intens, &thd, 0, NULL, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 7); + printer::inst()->print_msg(L1, "Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 7); return ERR_OCL_API; } @@ -1288,7 +1267,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[1], 1, 0, &g_thd_cn_gpu, &w_size_cn_gpu, 0, NULL, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1); + printer::inst()->print_msg(L1, "Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1); return ERR_OCL_API; } } @@ -1296,14 +1275,14 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner { if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1); + printer::inst()->print_msg(L1, "Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1); return ERR_OCL_API; } } if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[2], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 2); + printer::inst()->print_msg(L1, "Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 2); return ERR_OCL_API; } @@ -1314,7 +1293,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner size_t tmpNonce = ctx->Nonce; if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[i + 3], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3); + printer::inst()->print_msg(L1, "Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3); return ERR_OCL_API; } } @@ -1323,11 +1302,11 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner // this call is blocking therefore the access to the results without cl_finish is fine if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->OutputBuffer, CL_TRUE, 0, sizeof(cl_uint) * 0x100, HashOutput, 0, NULL, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); + printer::inst()->print_msg(L1, "Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); return ERR_OCL_API; } - auto & numHashValues = HashOutput[0xFF]; + auto& numHashValues = HashOutput[0xFF]; // avoid out of memory read, we have only storage for 0xFF results if(numHashValues > 0xFF) numHashValues = 0xFF; diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp index ae2b506db..1ba300c7a 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.hpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp @@ -1,7 +1,7 @@ #pragma once -#include "xmrstak/misc/console.hpp" #include "xmrstak/jconf.hpp" +#include "xmrstak/misc/console.hpp" #if defined(__APPLE__) #include @@ -9,13 +9,13 @@ #include #endif +#include +#include +#include +#include #include #include #include -#include -#include -#include -#include #define ERR_SUCCESS (0) #define ERR_OCL_API (2) @@ -23,13 +23,13 @@ struct InterleaveData { - std::mutex mutex; + std::mutex mutex; - double adjustThreshold = 0.4; - double startAdjustThreshold = 0.4; - double avgKernelRuntime = 0.0; - uint64_t lastRunTimeStamp = 0; - uint32_t numThreadsOnGPU = 0; + double adjustThreshold = 0.4; + double startAdjustThreshold = 0.4; + double avgKernelRuntime = 0.0; + uint64_t lastRunTimeStamp = 0; + uint32_t numThreadsOnGPU = 0; }; struct GpuContext @@ -54,8 +54,9 @@ struct GpuContext cl_mem ExtraBuffers[6]; cl_context opencl_ctx = nullptr; std::map Program; - std::map> Kernels; + std::map> Kernels; cl_program ProgramCryptonightR = nullptr; + uint64_t last_block_height = 0u; size_t freeMem; size_t maxMemPerAlloc; int computeUnits; @@ -66,148 +67,147 @@ struct GpuContext uint64_t lastDelay = 0; uint32_t Nonce; - }; namespace { - const char* err_to_str(cl_int ret) +const char* err_to_str(cl_int ret) +{ + switch(ret) { - switch(ret) - { - case CL_SUCCESS: - return "CL_SUCCESS"; - case CL_DEVICE_NOT_FOUND: - return "CL_DEVICE_NOT_FOUND"; - case CL_DEVICE_NOT_AVAILABLE: - return "CL_DEVICE_NOT_AVAILABLE"; - case CL_COMPILER_NOT_AVAILABLE: - return "CL_COMPILER_NOT_AVAILABLE"; - case CL_MEM_OBJECT_ALLOCATION_FAILURE: - return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; - case CL_OUT_OF_RESOURCES: - return "CL_OUT_OF_RESOURCES"; - case CL_OUT_OF_HOST_MEMORY: - return "CL_OUT_OF_HOST_MEMORY"; - case CL_PROFILING_INFO_NOT_AVAILABLE: - return "CL_PROFILING_INFO_NOT_AVAILABLE"; - case CL_MEM_COPY_OVERLAP: - return "CL_MEM_COPY_OVERLAP"; - case CL_IMAGE_FORMAT_MISMATCH: - return "CL_IMAGE_FORMAT_MISMATCH"; - case CL_IMAGE_FORMAT_NOT_SUPPORTED: - return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; - case CL_BUILD_PROGRAM_FAILURE: - return "CL_BUILD_PROGRAM_FAILURE"; - case CL_MAP_FAILURE: - return "CL_MAP_FAILURE"; - case CL_MISALIGNED_SUB_BUFFER_OFFSET: - return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; - case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: - return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; - #ifdef CL_VERSION_1_2 - case CL_COMPILE_PROGRAM_FAILURE: - return "CL_COMPILE_PROGRAM_FAILURE"; - case CL_LINKER_NOT_AVAILABLE: - return "CL_LINKER_NOT_AVAILABLE"; - case CL_LINK_PROGRAM_FAILURE: - return "CL_LINK_PROGRAM_FAILURE"; - case CL_DEVICE_PARTITION_FAILED: - return "CL_DEVICE_PARTITION_FAILED"; - case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: - return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; - #endif - case CL_INVALID_VALUE: - return "CL_INVALID_VALUE"; - case CL_INVALID_DEVICE_TYPE: - return "CL_INVALID_DEVICE_TYPE"; - case CL_INVALID_PLATFORM: - return "CL_INVALID_PLATFORM"; - case CL_INVALID_DEVICE: - return "CL_INVALID_DEVICE"; - case CL_INVALID_CONTEXT: - return "CL_INVALID_CONTEXT"; - case CL_INVALID_QUEUE_PROPERTIES: - return "CL_INVALID_QUEUE_PROPERTIES"; - case CL_INVALID_COMMAND_QUEUE: - return "CL_INVALID_COMMAND_QUEUE"; - case CL_INVALID_HOST_PTR: - return "CL_INVALID_HOST_PTR"; - case CL_INVALID_MEM_OBJECT: - return "CL_INVALID_MEM_OBJECT"; - case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: - return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; - case CL_INVALID_IMAGE_SIZE: - return "CL_INVALID_IMAGE_SIZE"; - case CL_INVALID_SAMPLER: - return "CL_INVALID_SAMPLER"; - case CL_INVALID_BINARY: - return "CL_INVALID_BINARY"; - case CL_INVALID_BUILD_OPTIONS: - return "CL_INVALID_BUILD_OPTIONS"; - case CL_INVALID_PROGRAM: - return "CL_INVALID_PROGRAM"; - case CL_INVALID_PROGRAM_EXECUTABLE: - return "CL_INVALID_PROGRAM_EXECUTABLE"; - case CL_INVALID_KERNEL_NAME: - return "CL_INVALID_KERNEL_NAME"; - case CL_INVALID_KERNEL_DEFINITION: - return "CL_INVALID_KERNEL_DEFINITION"; - case CL_INVALID_KERNEL: - return "CL_INVALID_KERNEL"; - case CL_INVALID_ARG_INDEX: - return "CL_INVALID_ARG_INDEX"; - case CL_INVALID_ARG_VALUE: - return "CL_INVALID_ARG_VALUE"; - case CL_INVALID_ARG_SIZE: - return "CL_INVALID_ARG_SIZE"; - case CL_INVALID_KERNEL_ARGS: - return "CL_INVALID_KERNEL_ARGS"; - case CL_INVALID_WORK_DIMENSION: - return "CL_INVALID_WORK_DIMENSION"; - case CL_INVALID_WORK_GROUP_SIZE: - return "CL_INVALID_WORK_GROUP_SIZE"; - case CL_INVALID_WORK_ITEM_SIZE: - return "CL_INVALID_WORK_ITEM_SIZE"; - case CL_INVALID_GLOBAL_OFFSET: - return "CL_INVALID_GLOBAL_OFFSET"; - case CL_INVALID_EVENT_WAIT_LIST: - return "CL_INVALID_EVENT_WAIT_LIST"; - case CL_INVALID_EVENT: - return "CL_INVALID_EVENT"; - case CL_INVALID_OPERATION: - return "CL_INVALID_OPERATION"; - case CL_INVALID_GL_OBJECT: - return "CL_INVALID_GL_OBJECT"; - case CL_INVALID_BUFFER_SIZE: - return "CL_INVALID_BUFFER_SIZE"; - case CL_INVALID_MIP_LEVEL: - return "CL_INVALID_MIP_LEVEL"; - case CL_INVALID_GLOBAL_WORK_SIZE: - return "CL_INVALID_GLOBAL_WORK_SIZE"; - case CL_INVALID_PROPERTY: - return "CL_INVALID_PROPERTY"; - #ifdef CL_VERSION_1_2 - case CL_INVALID_IMAGE_DESCRIPTOR: - return "CL_INVALID_IMAGE_DESCRIPTOR"; - case CL_INVALID_COMPILER_OPTIONS: - return "CL_INVALID_COMPILER_OPTIONS"; - case CL_INVALID_LINKER_OPTIONS: - return "CL_INVALID_LINKER_OPTIONS"; - case CL_INVALID_DEVICE_PARTITION_COUNT: - return "CL_INVALID_DEVICE_PARTITION_COUNT"; - #endif - #if defined(CL_VERSION_2_0) && !defined(CONF_ENFORCE_OpenCL_1_2) - case CL_INVALID_PIPE_SIZE: - return "CL_INVALID_PIPE_SIZE"; - case CL_INVALID_DEVICE_QUEUE: - return "CL_INVALID_DEVICE_QUEUE"; - #endif - default: - return "UNKNOWN_ERROR"; - } + case CL_SUCCESS: + return "CL_SUCCESS"; + case CL_DEVICE_NOT_FOUND: + return "CL_DEVICE_NOT_FOUND"; + case CL_DEVICE_NOT_AVAILABLE: + return "CL_DEVICE_NOT_AVAILABLE"; + case CL_COMPILER_NOT_AVAILABLE: + return "CL_COMPILER_NOT_AVAILABLE"; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: + return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + case CL_OUT_OF_RESOURCES: + return "CL_OUT_OF_RESOURCES"; + case CL_OUT_OF_HOST_MEMORY: + return "CL_OUT_OF_HOST_MEMORY"; + case CL_PROFILING_INFO_NOT_AVAILABLE: + return "CL_PROFILING_INFO_NOT_AVAILABLE"; + case CL_MEM_COPY_OVERLAP: + return "CL_MEM_COPY_OVERLAP"; + case CL_IMAGE_FORMAT_MISMATCH: + return "CL_IMAGE_FORMAT_MISMATCH"; + case CL_IMAGE_FORMAT_NOT_SUPPORTED: + return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; + case CL_BUILD_PROGRAM_FAILURE: + return "CL_BUILD_PROGRAM_FAILURE"; + case CL_MAP_FAILURE: + return "CL_MAP_FAILURE"; + case CL_MISALIGNED_SUB_BUFFER_OFFSET: + return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; + case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: + return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; +#ifdef CL_VERSION_1_2 + case CL_COMPILE_PROGRAM_FAILURE: + return "CL_COMPILE_PROGRAM_FAILURE"; + case CL_LINKER_NOT_AVAILABLE: + return "CL_LINKER_NOT_AVAILABLE"; + case CL_LINK_PROGRAM_FAILURE: + return "CL_LINK_PROGRAM_FAILURE"; + case CL_DEVICE_PARTITION_FAILED: + return "CL_DEVICE_PARTITION_FAILED"; + case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: + return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; +#endif + case CL_INVALID_VALUE: + return "CL_INVALID_VALUE"; + case CL_INVALID_DEVICE_TYPE: + return "CL_INVALID_DEVICE_TYPE"; + case CL_INVALID_PLATFORM: + return "CL_INVALID_PLATFORM"; + case CL_INVALID_DEVICE: + return "CL_INVALID_DEVICE"; + case CL_INVALID_CONTEXT: + return "CL_INVALID_CONTEXT"; + case CL_INVALID_QUEUE_PROPERTIES: + return "CL_INVALID_QUEUE_PROPERTIES"; + case CL_INVALID_COMMAND_QUEUE: + return "CL_INVALID_COMMAND_QUEUE"; + case CL_INVALID_HOST_PTR: + return "CL_INVALID_HOST_PTR"; + case CL_INVALID_MEM_OBJECT: + return "CL_INVALID_MEM_OBJECT"; + case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: + return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; + case CL_INVALID_IMAGE_SIZE: + return "CL_INVALID_IMAGE_SIZE"; + case CL_INVALID_SAMPLER: + return "CL_INVALID_SAMPLER"; + case CL_INVALID_BINARY: + return "CL_INVALID_BINARY"; + case CL_INVALID_BUILD_OPTIONS: + return "CL_INVALID_BUILD_OPTIONS"; + case CL_INVALID_PROGRAM: + return "CL_INVALID_PROGRAM"; + case CL_INVALID_PROGRAM_EXECUTABLE: + return "CL_INVALID_PROGRAM_EXECUTABLE"; + case CL_INVALID_KERNEL_NAME: + return "CL_INVALID_KERNEL_NAME"; + case CL_INVALID_KERNEL_DEFINITION: + return "CL_INVALID_KERNEL_DEFINITION"; + case CL_INVALID_KERNEL: + return "CL_INVALID_KERNEL"; + case CL_INVALID_ARG_INDEX: + return "CL_INVALID_ARG_INDEX"; + case CL_INVALID_ARG_VALUE: + return "CL_INVALID_ARG_VALUE"; + case CL_INVALID_ARG_SIZE: + return "CL_INVALID_ARG_SIZE"; + case CL_INVALID_KERNEL_ARGS: + return "CL_INVALID_KERNEL_ARGS"; + case CL_INVALID_WORK_DIMENSION: + return "CL_INVALID_WORK_DIMENSION"; + case CL_INVALID_WORK_GROUP_SIZE: + return "CL_INVALID_WORK_GROUP_SIZE"; + case CL_INVALID_WORK_ITEM_SIZE: + return "CL_INVALID_WORK_ITEM_SIZE"; + case CL_INVALID_GLOBAL_OFFSET: + return "CL_INVALID_GLOBAL_OFFSET"; + case CL_INVALID_EVENT_WAIT_LIST: + return "CL_INVALID_EVENT_WAIT_LIST"; + case CL_INVALID_EVENT: + return "CL_INVALID_EVENT"; + case CL_INVALID_OPERATION: + return "CL_INVALID_OPERATION"; + case CL_INVALID_GL_OBJECT: + return "CL_INVALID_GL_OBJECT"; + case CL_INVALID_BUFFER_SIZE: + return "CL_INVALID_BUFFER_SIZE"; + case CL_INVALID_MIP_LEVEL: + return "CL_INVALID_MIP_LEVEL"; + case CL_INVALID_GLOBAL_WORK_SIZE: + return "CL_INVALID_GLOBAL_WORK_SIZE"; + case CL_INVALID_PROPERTY: + return "CL_INVALID_PROPERTY"; +#ifdef CL_VERSION_1_2 + case CL_INVALID_IMAGE_DESCRIPTOR: + return "CL_INVALID_IMAGE_DESCRIPTOR"; + case CL_INVALID_COMPILER_OPTIONS: + return "CL_INVALID_COMPILER_OPTIONS"; + case CL_INVALID_LINKER_OPTIONS: + return "CL_INVALID_LINKER_OPTIONS"; + case CL_INVALID_DEVICE_PARTITION_COUNT: + return "CL_INVALID_DEVICE_PARTITION_COUNT"; +#endif +#if defined(CL_VERSION_2_0) && !defined(CONF_ENFORCE_OpenCL_1_2) + case CL_INVALID_PIPE_SIZE: + return "CL_INVALID_PIPE_SIZE"; + case CL_INVALID_DEVICE_QUEUE: + return "CL_INVALID_DEVICE_QUEUE"; +#endif + default: + return "UNKNOWN_ERROR"; } } +} // namespace uint32_t getNumPlatforms(); int getAMDPlatformIdx(); diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 12478aefb..471e46a53 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -198,7 +198,7 @@ inline void keccakf1600_1(ulong st[25]) } } )===" -R"===( + R"===( void keccakf1600_2(__local ulong *st) { @@ -372,7 +372,7 @@ inline int4 _mm_alignr_epi8(int4 a, const uint rot) #endif )===" -R"===( + R"===( void CNKeccak(ulong *output, ulong *input) { @@ -416,7 +416,7 @@ void AESExpandKey256(uint *keybuf) } )===" -R"===( + R"===( #define mix_and_propagate(xin) (xin)[(get_local_id(1)) % 8][get_local_id(0)] ^ (xin)[(get_local_id(1) + 1) % 8][get_local_id(0)] @@ -577,7 +577,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, } )===" -R"===( + R"===( // __NV_CL_C_VERSION checks if NVIDIA opencl is used #if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) && defined(__NV_CL_C_VERSION)) @@ -867,7 +867,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states } )===" -R"===( + R"===( __attribute__((reqd_work_group_size(8, 8, 1))) __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states, @@ -1051,7 +1051,7 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states } )===" -R"===( + R"===( #define VSWAP8(x) (((x) >> 56) | (((x) >> 40) & 0x000000000000FF00UL) | (((x) >> 24) & 0x0000000000FF0000UL) \ | (((x) >> 8) & 0x00000000FF000000UL) | (((x) << 8) & 0x000000FF00000000UL) \ diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl index e87819760..bb37581f2 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl @@ -84,7 +84,7 @@ inline void single_comupte_wrap(const uint rot, int4 v0, int4 v1, int4 v2, int4 } )===" -R"===( + R"===( static const __constant uint look[16][4] = { {0, 1, 2, 3}, @@ -220,7 +220,7 @@ __kernel void JOIN(cn1_cn_gpu,ALGO)(__global int *lpad_in, __global int *spad, u } )===" -R"===( + R"===( static const __constant uint skip[3] = { 20,22,22 diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.rtcl similarity index 88% rename from xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.cl rename to xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.rtcl index 9edb774ad..cdb5aef3e 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.rtcl @@ -1,4 +1,5 @@ R"===( + /* * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -15,29 +16,15 @@ R"===( * */ -#define cryptonight_r_wow 15 -#define cryptonight_r 16 - -#define MEM_CHUNK (1 << MEM_CHUNK_EXPONENT) - -#if(STRIDED_INDEX==0) -# define IDX(x) (x) -#elif(STRIDED_INDEX==1) -# define IDX(x) (mul24(((uint)(x)), Threads)) -#elif(STRIDED_INDEX==2) -# define IDX(x) (((x) % MEM_CHUNK) + ((x) / MEM_CHUNK) * WORKSIZE * MEM_CHUNK) -#elif(STRIDED_INDEX==3) -# define IDX(x) ((x) * WORKSIZE) -#endif - +#ifndef SCRATCHPAD_CHUNK // __NV_CL_C_VERSION checks if NVIDIA opencl is used -#if(ALGO == cryptonight_monero_v8 && defined(__NV_CL_C_VERSION)) -# define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idx1 ^ (N << 4)))) -# define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4))))) -#else -# define SCRATCHPAD_CHUNK(N) (Scratchpad[IDX(((idx) >> 4) ^ N)]) +# if((ALGO == cryptonight_r_wow || ALGO == cryptonight_r) && defined(__NV_CL_C_VERSION)) +# define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idx1 ^ (N << 4)))) +# define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4))))) +# else +# define SCRATCHPAD_CHUNK(N) (Scratchpad[IDX(((idx) >> 4) ^ N)]) +# endif #endif - __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void cn1_cryptonight_r(__global uint4 *Scratchpad, __global ulong *states, uint Threads) { @@ -162,7 +149,9 @@ __kernel void cn1_cryptonight_r(__global uint4 *Scratchpad, __global ulong *stat #endif #define ROT_BITS 32 - XMRSTAK_INCLUDE_RANDOM_MATH +XMRSTAK_INCLUDE_RANDOM_MATH + +#undef ROT_BITS #if (ALGO == cryptonight_r) diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r_def.rtcl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r_def.rtcl new file mode 100644 index 000000000..2c318fcbf --- /dev/null +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r_def.rtcl @@ -0,0 +1,33 @@ +R"===( +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + */ + +#define cryptonight_r_wow 15 +#define cryptonight_r 16 + +#define MEM_CHUNK (1 << MEM_CHUNK_EXPONENT) + +#if(STRIDED_INDEX==0) +# define IDX(x) (x) +#elif(STRIDED_INDEX==1) +# define IDX(x) (mul24(((uint)(x)), Threads)) +#elif(STRIDED_INDEX==2) +# define IDX(x) (((x) % MEM_CHUNK) + ((x) / MEM_CHUNK) * WORKSIZE * MEM_CHUNK) +#elif(STRIDED_INDEX==3) +# define IDX(x) ((x) * WORKSIZE) +#endif + +)===" diff --git a/xmrstak/backend/amd/amd_gpu/opencl/groestl256.cl b/xmrstak/backend/amd/amd_gpu/opencl/groestl256.cl index 22603853f..02ce53e03 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/groestl256.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/groestl256.cl @@ -125,7 +125,7 @@ static const __constant ulong T0_G[] = }; )===" -R"===( + R"===( static const __constant ulong T4_G[] = { @@ -292,4 +292,3 @@ static const __constant ulong T4_G[] = } while (0) )===" - diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp index 120fb6898..dcabb3018 100644 --- a/xmrstak/backend/amd/autoAdjust.hpp +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -5,18 +5,18 @@ #include "autoAdjust.hpp" #include "jconf.hpp" -#include "xmrstak/misc/console.hpp" -#include "xmrstak/misc/configEditor.hpp" -#include "xmrstak/params.hpp" #include "xmrstak/backend/cryptonight.hpp" #include "xmrstak/jconf.hpp" +#include "xmrstak/misc/configEditor.hpp" +#include "xmrstak/misc/console.hpp" +#include "xmrstak/params.hpp" -#include +#include #include +#include #include #include -#include -#include +#include #if defined(__APPLE__) #include @@ -24,7 +24,6 @@ #include #endif - namespace xmrstak { namespace amd @@ -32,11 +31,9 @@ namespace amd class autoAdjust { -public: - + public: autoAdjust() { - } /** print the adjusted values if needed @@ -50,18 +47,17 @@ class autoAdjust if(platformIndex == -1) { - printer::inst()->print_msg(L0,"WARNING: No AMD OpenCL platform found. Possible driver issues or wrong vendor driver."); + printer::inst()->print_msg(L0, "WARNING: No AMD OpenCL platform found. Possible driver issues or wrong vendor driver."); return false; } devVec = getAMDDevices(platformIndex); - int deviceCount = devVec.size(); if(deviceCount == 0) { - printer::inst()->print_msg(L0,"WARNING: No AMD device found."); + printer::inst()->print_msg(L0, "WARNING: No AMD device found."); return false; } @@ -69,17 +65,16 @@ class autoAdjust return true; } -private: - + private: void generateThreadConfig(const int platformIndex) { // load the template of the backend config into a char variable - const char *tpl = - #include "./config.tpl" - ; + const char* tpl = +#include "./config.tpl" + ; configEditor configTpl{}; - configTpl.set( std::string(tpl) ); + configTpl.set(std::string(tpl)); constexpr size_t byteToMiB = 1024u * 1024u; @@ -107,8 +102,7 @@ class autoAdjust // UNKNOWN ctx.name.compare("gfx900") == 0 || ctx.name.compare("gfx903") == 0 || - ctx.name.compare("gfx905") == 0 - ) + ctx.name.compare("gfx905") == 0) { /* Increase the number of threads for AMD VEGA gpus. * Limit the number of threads based on the issue: https://github.com/fireice-uk/xmr-stak/issues/5#issuecomment-339425089 @@ -119,11 +113,8 @@ class autoAdjust // NVIDIA optimizations if( - ctx.isNVIDIA && ( - ctx.name.find("P100") != std::string::npos || - ctx.name.find("V100") != std::string::npos - ) - ) + ctx.isNVIDIA && (ctx.name.find("P100") != std::string::npos || + ctx.name.find("V100") != std::string::npos)) { // do not limit the number of threads maxThreads = 40000u; @@ -190,7 +181,7 @@ class autoAdjust // 240byte extra memory is used per thread for meta data size_t perThread = hashMemSize + 240u; size_t maxIntensity = memPerThread / perThread; - size_t possibleIntensity = std::min( maxThreads , maxIntensity ); + size_t possibleIntensity = std::min(maxThreads, maxIntensity); // map intensity to a multiple of the compute unit count, 8 is the number of threads per work group size_t intensity = (possibleIntensity / (8 * ctx.computeUnits)) * ctx.computeUnits * 8; // in the case we use two threads per gpu we can be relax and need no multiple of the number of compute units @@ -198,25 +189,25 @@ class autoAdjust intensity = (possibleIntensity / 8) * 8; //If the intensity is 0, then it's because the multiple of the unit count is greater than intensity - if (intensity == 0) + if(intensity == 0) { printer::inst()->print_msg(L0, "WARNING: Auto detected intensity unexpectedly low. Try to set the environment variable GPU_SINGLE_ALLOC_PERCENT."); intensity = possibleIntensity; - } - if (intensity != 0) + if(intensity != 0) { for(uint32_t thd = 0; thd < numThreads; ++thd) { conf += " // gpu: " + ctx.name + std::string(" compute units: ") + std::to_string(ctx.computeUnits) + "\n"; conf += " // memory:" + std::to_string(memPerThread / byteToMiB) + "|" + - std::to_string(ctx.maxMemPerAlloc / byteToMiB) + "|" + std::to_string(maxAvailableFreeMem / byteToMiB) + " MiB (used per thread|max per alloc|total free)\n"; + std::to_string(ctx.maxMemPerAlloc / byteToMiB) + "|" + std::to_string(maxAvailableFreeMem / byteToMiB) + " MiB (used per thread|max per alloc|total free)\n"; // set 8 threads per block (this is a good value for the most gpus) conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" + - " \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" + - " \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n" - " \"unroll\" : " + std::to_string(numUnroll) + ", \"comp_mode\" : true, \"interleave\" : " + std::to_string(ctx.interleave) + "\n" + - " },\n"; + " \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" + + " \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n" + " \"unroll\" : " + + std::to_string(numUnroll) + ", \"comp_mode\" : true, \"interleave\" : " + std::to_string(ctx.interleave) + "\n" + + " },\n"; } } else @@ -225,8 +216,8 @@ class autoAdjust } } - configTpl.replace("PLATFORMINDEX",std::to_string(platformIndex)); - configTpl.replace("GPUCONFIG",conf); + configTpl.replace("PLATFORMINDEX", std::to_string(platformIndex)); + configTpl.replace("GPUCONFIG", conf); configTpl.write(params::inst().configFileAMD); const std::string backendName = xmrstak::params::inst().openCLVendor; diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp index d3dc00d01..c5a63c56f 100644 --- a/xmrstak/backend/amd/jconf.cpp +++ b/xmrstak/backend/amd/jconf.cpp @@ -21,10 +21,9 @@ * */ - #include "jconf.hpp" -#include "xmrstak/misc/jext.hpp" #include "xmrstak/misc/console.hpp" +#include "xmrstak/misc/jext.hpp" #ifdef _WIN32 #define strcasecmp _stricmp @@ -37,7 +36,6 @@ #include #include - namespace xmrstak { namespace amd @@ -48,9 +46,14 @@ using namespace rapidjson; /* * This enum needs to match index in oConfigValues, otherwise we will get a runtime error */ -enum configEnum { aGpuThreadsConf, iPlatformIdx }; +enum configEnum +{ + aGpuThreadsConf, + iPlatformIdx +}; -struct configVal { +struct configVal +{ configEnum iName; const char* sName; Type iType; @@ -59,24 +62,25 @@ struct configVal { // Same order as in configEnum, as per comment above // kNullType means any type configVal oConfigValues[] = { - { aGpuThreadsConf, "gpu_threads_conf", kNullType }, - { iPlatformIdx, "platform_index", kNumberType } -}; - -constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0])); + {aGpuThreadsConf, "gpu_threads_conf", kNullType}, + {iPlatformIdx, "platform_index", kNumberType}}; +constexpr size_t iConfigCnt = (sizeof(oConfigValues) / sizeof(oConfigValues[0])); -enum optionalConfigEnum { iAutoTune }; +enum optionalConfigEnum +{ + iAutoTune +}; -struct optionalConfigVal { +struct optionalConfigVal +{ optionalConfigEnum iName; const char* sName; Type iType; }; optionalConfigVal oOptionalConfigValues[] = { - { iAutoTune, "auto_tune", kNumberType } -}; + {iAutoTune, "auto_tune", kNumberType}}; inline bool checkType(Type have, Type want) { @@ -109,7 +113,7 @@ jconf::jconf() prv = new opaque_private(); } -bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) +bool jconf::GetThreadConfig(size_t id, thd_cfg& cfg) { if(id >= prv->configValues[aGpuThreadsConf]->Size()) return false; @@ -176,7 +180,7 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) return false; } - if(!memChunk->IsUint64() || (int)memChunk->GetInt64() > 18 ) + if(!memChunk->IsUint64() || (int)memChunk->GetInt64() > 18) { printer::inst()->print_msg(L0, "ERROR: mem_chunk must be smaller than 18"); return false; @@ -215,7 +219,7 @@ size_t jconf::GetPlatformIdx() size_t jconf::GetAutoTune() { const Value* value = GetObjectMember(prv->jsonDoc, oOptionalConfigValues[iAutoTune].sName); - if( value != nullptr && value->IsUint64()) + if(value != nullptr && value->IsUint64()) { return value->GetUint64(); } @@ -233,22 +237,22 @@ size_t jconf::GetThreadCount() bool jconf::parse_config(const char* sFilename) { - FILE * pFile; - char * buffer; + FILE* pFile; + char* buffer; size_t flen; pFile = fopen(sFilename, "rb"); - if (pFile == NULL) + if(pFile == NULL) { printer::inst()->print_msg(L0, "Failed to open config file %s.", sFilename); return false; } - fseek(pFile,0,SEEK_END); + fseek(pFile, 0, SEEK_END); flen = ftell(pFile); rewind(pFile); - if(flen >= 64*1024) + if(flen >= 64 * 1024) { fclose(pFile); printer::inst()->print_msg(L0, "Oversized config file - %s.", sFilename); @@ -262,7 +266,7 @@ bool jconf::parse_config(const char* sFilename) } buffer = (char*)malloc(flen + 3); - if(fread(buffer+1, flen, 1, pFile) != 1) + if(fread(buffer + 1, flen, 1, pFile) != 1) { free(buffer); fclose(pFile); @@ -284,7 +288,7 @@ bool jconf::parse_config(const char* sFilename) buffer[flen] = '}'; buffer[flen + 1] = '\0'; - prv->jsonDoc.Parse(buffer, flen+2); + prv->jsonDoc.Parse(buffer, flen + 2); free(buffer); if(prv->jsonDoc.HasParseError()) @@ -294,7 +298,6 @@ bool jconf::parse_config(const char* sFilename) return false; } - if(!prv->jsonDoc.IsObject()) { //This should never happen as we created the root ourselves printer::inst()->print_msg(L0, "Invalid config file '%s'. No root?", sFilename); @@ -326,7 +329,7 @@ bool jconf::parse_config(const char* sFilename) size_t n_thd = prv->configValues[aGpuThreadsConf]->Size(); thd_cfg c; - for(size_t i=0; i < n_thd; i++) + for(size_t i = 0; i < n_thd; i++) { if(!GetThreadConfig(i, c)) { diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp index 51a0c79ac..6f50c3059 100644 --- a/xmrstak/backend/amd/jconf.hpp +++ b/xmrstak/backend/amd/jconf.hpp @@ -12,16 +12,18 @@ namespace amd class jconf { -public: + public: static jconf* inst() { - if (oInst == nullptr) oInst = new jconf; + if(oInst == nullptr) + oInst = new jconf; return oInst; }; bool parse_config(const char* sFilename = params::inst().configFileAMD.c_str()); - struct thd_cfg { + struct thd_cfg + { size_t index; size_t intensity; size_t w_size; @@ -34,18 +36,17 @@ class jconf }; size_t GetThreadCount(); - bool GetThreadConfig(size_t id, thd_cfg &cfg); + bool GetThreadConfig(size_t id, thd_cfg& cfg); size_t GetAutoTune(); size_t GetPlatformIdx(); -private: + private: jconf(); static jconf* oInst; struct opaque_private; opaque_private* prv; - }; } // namespace amd diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp index 3be593175..0a181154c 100644 --- a/xmrstak/backend/amd/minethd.cpp +++ b/xmrstak/backend/amd/minethd.cpp @@ -22,23 +22,23 @@ */ #include "minethd.hpp" -#include "autoAdjust.hpp" #include "amd_gpu/gpu.hpp" +#include "autoAdjust.hpp" -#include "xmrstak/backend/cpu/crypto/cryptonight_aesni.h" #include "xmrstak/backend/cpu/crypto/cryptonight.h" -#include "xmrstak/misc/configEditor.hpp" -#include "xmrstak/misc/console.hpp" +#include "xmrstak/backend/cpu/crypto/cryptonight_aesni.h" +#include "xmrstak/backend/cpu/hwlocMemory.hpp" #include "xmrstak/backend/cpu/minethd.hpp" #include "xmrstak/jconf.hpp" -#include "xmrstak/misc/executor.hpp" +#include "xmrstak/misc/configEditor.hpp" +#include "xmrstak/misc/console.hpp" #include "xmrstak/misc/environment.hpp" +#include "xmrstak/misc/executor.hpp" #include "xmrstak/params.hpp" -#include "xmrstak/backend/cpu/hwlocMemory.hpp" #include -#include #include +#include #include #include @@ -72,15 +72,16 @@ minethd::minethd(miner_work& pWork, size_t iNo, GpuContext* ctx, const jconf::th printer::inst()->print_msg(L1, "WARNING setting affinity failed."); } -extern "C" { +extern "C" +{ #ifdef WIN32 -__declspec(dllexport) + __declspec(dllexport) #endif -std::vector* xmrstak_start_backend(uint32_t threadOffset, miner_work& pWork, environment& env) -{ - environment::inst(&env); - return amd::minethd::thread_starter(threadOffset, pWork); -} + std::vector* xmrstak_start_backend(uint32_t threadOffset, miner_work& pWork, environment& env) + { + environment::inst(&env); + return amd::minethd::thread_starter(threadOffset, pWork); + } } // extern "C" bool minethd::init_gpus() @@ -137,7 +138,7 @@ std::vector* minethd::thread_starter(uint32_t threadOffset, miner_wor pvThreads->reserve(n); jconf::thd_cfg cfg; - for (i = 0; i < n; i++) + for(i = 0; i < n; i++) { jconf::inst()->GetThreadConfig(i, cfg); @@ -161,7 +162,6 @@ std::vector* minethd::thread_starter(uint32_t threadOffset, miner_wor return pvThreads; } - void minethd::work_main() { if(affinity >= 0) //-1 means no affinity @@ -172,7 +172,6 @@ void minethd::work_main() lck.release(); std::this_thread::yield(); - uint64_t iCount = 0; cryptonight_ctx* cpu_ctx; cpu_ctx = cpu::minethd::minethd_alloc_ctx(); @@ -204,16 +203,16 @@ void minethd::work_main() double bestHashrate = 0.0; uint32_t bestIntensity = pGpuCtx->maxRawIntensity; - while (bQuit == 0) + while(bQuit == 0) { - if (oWork.bStall) + if(oWork.bStall) { /* We are stalled here because the executor didn't find a job for us yet, * either because of network latency, or a socket problem. Since we are * raison d'etre of this software it us sensible to just wait until we have something */ - while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) + while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) std::this_thread::sleep_for(std::chrono::milliseconds(100)); globalStates::inst().consume_work(oWork, iJobNo); @@ -267,14 +266,14 @@ void minethd::work_main() uint64_t t0 = interleaveAdjustDelay(pGpuCtx, adjustInterleave); cl_uint results[0x100]; - memset(results,0,sizeof(cl_uint)*(0x100)); + memset(results, 0, sizeof(cl_uint) * (0x100)); XMRRunJob(pGpuCtx, results, miner_algo); for(size_t i = 0; i < results[0xFF]; i++) { - uint8_t bWorkBlob[128]; - uint8_t bResult[32]; + uint8_t bWorkBlob[128]; + uint8_t bResult[32]; memcpy(bWorkBlob, oWork.bWorkBlob, oWork.iWorkSize); memset(bResult, 0, sizeof(job_result::bResult)); @@ -282,16 +281,13 @@ void minethd::work_main() *(uint32_t*)(bWorkBlob + 39) = results[i]; cpu_ctx->hash_fn(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx, miner_algo); - if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget) + if((*((uint64_t*)(bResult + 24))) < oWork.iTarget) executor::inst()->push_event(ex_event(job_result(oWork.sJobID, results[i], bResult, iThreadNo, miner_algo), oWork.iPoolId)); else executor::inst()->push_event(ex_event("AMD Invalid Result", pGpuCtx->deviceIdx, oWork.iPoolId)); } - iCount += pGpuCtx->rawIntensity; - uint64_t iStamp = get_timestamp_ms(); - iHashCount.store(iCount, std::memory_order_relaxed); - iTimestamp.store(iStamp, std::memory_order_relaxed); + updateStats(pGpuCtx->rawIntensity, oWork.iPoolId); accRuntime += updateTimings(pGpuCtx, t0); @@ -317,20 +313,18 @@ void minethd::work_main() // lock intensity to the best values autoTune = 0; pGpuCtx->rawIntensity = bestIntensity; - printer::inst()->print_msg(L1,"OpenCL %u|%u: lock intensity at %u", + printer::inst()->print_msg(L1, "OpenCL %u|%u: lock intensity at %u", pGpuCtx->deviceIdx, pGpuCtx->idWorkerOnDevice, - bestIntensity - ); + bestIntensity); } else { - printer::inst()->print_msg(L1,"OpenCL %u|%u: auto-tune validate intensity %u|%u", + printer::inst()->print_msg(L1, "OpenCL %u|%u: auto-tune validate intensity %u|%u", pGpuCtx->deviceIdx, pGpuCtx->idWorkerOnDevice, pGpuCtx->rawIntensity, - bestIntensity - ); + bestIntensity); } // update gpu with new intensity XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target, miner_algo, cpu_ctx->cn_r_ctx.height); diff --git a/xmrstak/backend/amd/minethd.hpp b/xmrstak/backend/amd/minethd.hpp index 402d63cd6..579abb1b5 100644 --- a/xmrstak/backend/amd/minethd.hpp +++ b/xmrstak/backend/amd/minethd.hpp @@ -3,27 +3,26 @@ #include "amd_gpu/gpu.hpp" #include "jconf.hpp" #include "xmrstak/backend/cpu/crypto/cryptonight.h" -#include "xmrstak/backend/miner_work.hpp" #include "xmrstak/backend/iBackend.hpp" +#include "xmrstak/backend/miner_work.hpp" #include "xmrstak/misc/environment.hpp" -#include #include #include +#include namespace xmrstak { namespace amd { -class minethd : public iBackend +class minethd : public iBackend { -public: - + public: static std::vector* thread_starter(uint32_t threadOffset, miner_work& pWork); static bool init_gpus(); -private: + private: typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**, const xmrstak_algo&); minethd(miner_work& pWork, size_t iNo, GpuContext* ctx, const jconf::thd_cfg cfg); diff --git a/xmrstak/backend/backendConnector.cpp b/xmrstak/backend/backendConnector.cpp index 0eea9fdd7..808fdca37 100644 --- a/xmrstak/backend/backendConnector.cpp +++ b/xmrstak/backend/backendConnector.cpp @@ -21,31 +21,30 @@ * */ -#include "iBackend.hpp" #include "backendConnector.hpp" -#include "miner_work.hpp" #include "globalStates.hpp" +#include "iBackend.hpp" +#include "miner_work.hpp" #include "plugin.hpp" -#include "xmrstak/misc/environment.hpp" #include "xmrstak/misc/console.hpp" +#include "xmrstak/misc/environment.hpp" #include "xmrstak/params.hpp" #include "cpu/minethd.hpp" #ifndef CONF_NO_CUDA -# include "nvidia/minethd.hpp" +#include "nvidia/minethd.hpp" #endif #ifndef CONF_NO_OPENCL -# include "amd/minethd.hpp" +#include "amd/minethd.hpp" #endif -#include #include -#include +#include #include +#include +#include #include #include -#include - namespace xmrstak { @@ -86,7 +85,7 @@ std::vector* BackendConnector::thread_starter(miner_work& pWork) std::vector libNames = {"xmrstak_cuda_backend_cuda10_0", "xmrstak_cuda_backend_cuda9_2", "xmrstak_cuda_backend"}; size_t numWorkers = 0u; - for( const auto & name : libNames) + for(const auto& name : libNames) { printer::inst()->print_msg(L0, "NVIDIA: try to load library '%s'", name.c_str()); nvidiaplugin.load("NVIDIA", name); diff --git a/xmrstak/backend/backendConnector.hpp b/xmrstak/backend/backendConnector.hpp index 66d873e48..1f2cb8ff6 100644 --- a/xmrstak/backend/backendConnector.hpp +++ b/xmrstak/backend/backendConnector.hpp @@ -3,19 +3,18 @@ #include "iBackend.hpp" #include "miner_work.hpp" -#include -#include #include #include - +#include +#include namespace xmrstak { - struct BackendConnector - { - static std::vector* thread_starter(miner_work& pWork); - static bool self_test(); - }; +struct BackendConnector +{ + static std::vector* thread_starter(miner_work& pWork); + static bool self_test(); +}; } // namespace xmrstak diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp index ba0e6984f..98c145004 100644 --- a/xmrstak/backend/cpu/autoAdjust.hpp +++ b/xmrstak/backend/cpu/autoAdjust.hpp @@ -2,12 +2,12 @@ #include "jconf.hpp" -#include "xmrstak/misc/console.hpp" +#include "xmrstak/backend/cpu/cpuType.hpp" +#include "xmrstak/backend/cryptonight.hpp" #include "xmrstak/jconf.hpp" #include "xmrstak/misc/configEditor.hpp" +#include "xmrstak/misc/console.hpp" #include "xmrstak/params.hpp" -#include "xmrstak/backend/cryptonight.hpp" -#include "xmrstak/backend/cpu/cpuType.hpp" #include #ifdef _WIN32 @@ -16,7 +16,6 @@ #include #endif // _WIN32 - namespace xmrstak { namespace cpu @@ -24,8 +23,7 @@ namespace cpu class autoAdjust { -public: - + public: bool printConfig() { auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); @@ -42,10 +40,10 @@ class autoAdjust configEditor configTpl{}; // load the template of the backend config into a char variable - const char *tpl = - #include "./config.tpl" - ; - configTpl.set( std::string(tpl) ); + const char* tpl = +#include "./config.tpl" + ; + configTpl.set(std::string(tpl)); std::string conf; @@ -75,14 +73,14 @@ class autoAdjust linux_layout ? "Linux" : "Windows"); uint32_t aff_id = 0; - for(uint32_t i=0; i < corecnt; i++) + for(uint32_t i = 0; i < corecnt; i++) { bool double_mode; if(L3KB_size <= 0) break; - double_mode = L3KB_size / hashMemSizeKB > (int32_t)(corecnt-i); + double_mode = L3KB_size / hashMemSizeKB > (int32_t)(corecnt - i); conf += std::string(" { \"low_power_mode\" : "); conf += std::string(double_mode ? "true" : "false"); @@ -110,14 +108,14 @@ class autoAdjust if(useCryptonight_gpu) conf += "*/\n"; - configTpl.replace("CPUCONFIG",conf); + configTpl.replace("CPUCONFIG", conf); configTpl.write(params::inst().configFileCPU); printer::inst()->print_msg(L0, "CPU configuration stored in file '%s'", params::inst().configFileCPU.c_str()); return true; } -private: + private: bool detectL3Size() { int32_t cpu_info[4]; @@ -125,8 +123,8 @@ class autoAdjust ::jconf::cpuid(0, 0, cpu_info); memcpy(cpustr, &cpu_info[1], 4); - memcpy(cpustr+4, &cpu_info[3], 4); - memcpy(cpustr+8, &cpu_info[2], 4); + memcpy(cpustr + 4, &cpu_info[3], 4); + memcpy(cpustr + 8, &cpu_info[2], 4); if(strcmp(cpustr, "GenuineIntel") == 0) { @@ -139,7 +137,8 @@ class autoAdjust } L3KB_size = ((get_masked(cpu_info[1], 31, 22) + 1) * (get_masked(cpu_info[1], 21, 12) + 1) * - (get_masked(cpu_info[1], 11, 0) + 1) * (cpu_info[2] + 1)) / 1024; + (get_masked(cpu_info[1], 11, 0) + 1) * (cpu_info[2] + 1)) / + 1024; return true; } diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp index f09b1ebc0..d1765155a 100644 --- a/xmrstak/backend/cpu/autoAdjustHwloc.hpp +++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp @@ -1,9 +1,9 @@ #pragma once -#include "xmrstak/misc/console.hpp" +#include "xmrstak/backend/cryptonight.hpp" #include "xmrstak/misc/configEditor.hpp" +#include "xmrstak/misc/console.hpp" #include "xmrstak/params.hpp" -#include "xmrstak/backend/cryptonight.hpp" #ifdef _WIN32 #include @@ -16,7 +16,6 @@ #include #include - namespace xmrstak { namespace cpu @@ -24,8 +23,7 @@ namespace cpu class autoAdjust { -public: - + public: autoAdjust() { auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); @@ -48,10 +46,10 @@ class autoAdjust configEditor configTpl{}; // load the template of the backend config into a char variable - const char *tpl = - #include "./config.tpl" - ; - configTpl.set( std::string(tpl) ); + const char* tpl = +#include "./config.tpl" + ; + configTpl.set(std::string(tpl)); // if cryptonight_gpu is used we will disable cpu mining but provide a inactive config bool useCryptonight_gpu = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_gpu; @@ -69,7 +67,7 @@ class autoAdjust results.reserve(16); findChildrenCaches(hwloc_get_root_obj(topology), - [&tlcs](hwloc_obj_t found) { tlcs.emplace_back(found); } ); + [&tlcs](hwloc_obj_t found) { tlcs.emplace_back(found); }); if(tlcs.size() == 0) throw(std::runtime_error("The CPU doesn't seem to have a cache.")); @@ -97,7 +95,7 @@ class autoAdjust if(useCryptonight_gpu) conf += "*/\n"; - configTpl.replace("CPUCONFIG",conf); + configTpl.replace("CPUCONFIG", conf); configTpl.write(params::inst().configFileCPU); printer::inst()->print_msg(L0, "CPU configuration stored in file '%s'", params::inst().configFileCPU.c_str()); /* Destroy topology object. */ @@ -106,16 +104,16 @@ class autoAdjust return true; } -private: + private: size_t hashMemSize = 0; size_t halfHashMemSize = 0; std::vector results; - template + template inline void findChildrenByType(hwloc_obj_t obj, hwloc_obj_type_t type, func lambda) { - for(size_t i=0; i < obj->arity; i++) + for(size_t i = 0; i < obj->arity; i++) { if(obj->children[i]->type == type) lambda(obj->children[i]); @@ -133,10 +131,10 @@ class autoAdjust #endif // HWLOC_API_VERSION } - template + template inline void findChildrenCaches(hwloc_obj_t obj, func lambda) { - for(size_t i=0; i < obj->arity; i++) + for(size_t i = 0; i < obj->arity; i++) { if(isCacheObject(obj->children[i])) lambda(obj->children[i]); @@ -159,7 +157,7 @@ class autoAdjust throw(std::runtime_error("Cache object hasn't got attributes.")); size_t PUs = 0; - findChildrenByType(obj, HWLOC_OBJ_PU, [&PUs](hwloc_obj_t found) { PUs++; } ); + findChildrenByType(obj, HWLOC_OBJ_PU, [&PUs](hwloc_obj_t found) { PUs++; }); //Strange case, but we will handle it silently, surely there must be one PU somewhere? if(PUs == 0) @@ -172,7 +170,7 @@ class autoAdjust throw(std::runtime_error("The CPU doesn't seem to have a cache.")); //Try our luck with lower level caches - for(size_t i=0; i < obj->arity; i++) + for(size_t i = 0; i < obj->arity; i++) processTopLevelCache(obj->children[i]); return; } @@ -180,7 +178,7 @@ class autoAdjust size_t cacheSize = obj->attr->cache.size; if(isCacheExclusive(obj)) { - for(size_t i=0; i < obj->arity; i++) + for(size_t i = 0; i < obj->arity; i++) { hwloc_obj_t l2obj = obj->children[i]; //If L2 is exclusive and greater or equal to 2MB add room for one more hash @@ -191,7 +189,7 @@ class autoAdjust std::vector cores; cores.reserve(16); - findChildrenByType(obj, HWLOC_OBJ_CORE, [&cores](hwloc_obj_t found) { cores.emplace_back(found); } ); + findChildrenByType(obj, HWLOC_OBJ_CORE, [&cores](hwloc_obj_t found) { cores.emplace_back(found); }); size_t cacheHashes = (cacheSize + halfHashMemSize) / hashMemSize; diff --git a/xmrstak/backend/cpu/cpuType.cpp b/xmrstak/backend/cpu/cpuType.cpp index c85682d4f..5e2519c3b 100644 --- a/xmrstak/backend/cpu/cpuType.cpp +++ b/xmrstak/backend/cpu/cpuType.cpp @@ -1,9 +1,9 @@ #include "xmrstak/backend/cpu/cpuType.hpp" +#include #include #include -#include #ifdef _WIN32 #define strcasecmp _stricmp @@ -16,64 +16,63 @@ namespace xmrstak { namespace cpu { - void cpuid(uint32_t eax, int32_t ecx, int32_t val[4]) - { - std::memset(val, 0, sizeof(int32_t)*4); - - #ifdef _WIN32 - __cpuidex(val, eax, ecx); - #else - __cpuid_count(eax, ecx, val[0], val[1], val[2], val[3]); - #endif - } - - int32_t get_masked(int32_t val, int32_t h, int32_t l) - { - val &= (0x7FFFFFFF >> (31-(h-l))) << l; - return val >> l; - } +void cpuid(uint32_t eax, int32_t ecx, int32_t val[4]) +{ + std::memset(val, 0, sizeof(int32_t) * 4); - bool has_feature(int32_t val, int32_t bit) - { - int32_t mask = 1 << bit; - return (val & mask) != 0u; +#ifdef _WIN32 + __cpuidex(val, eax, ecx); +#else + __cpuid_count(eax, ecx, val[0], val[1], val[2], val[3]); +#endif +} - } +int32_t get_masked(int32_t val, int32_t h, int32_t l) +{ + val &= (0x7FFFFFFF >> (31 - (h - l))) << l; + return val >> l; +} - Model getModel() - { - int32_t cpu_info[4]; - char cpustr[13] = {0}; +bool has_feature(int32_t val, int32_t bit) +{ + int32_t mask = 1 << bit; + return (val & mask) != 0u; +} - cpuid(0, 0, cpu_info); - std::memcpy(cpustr, &cpu_info[1], 4); - std::memcpy(cpustr+4, &cpu_info[3], 4); - std::memcpy(cpustr+8, &cpu_info[2], 4); +Model getModel() +{ + int32_t cpu_info[4]; + char cpustr[13] = {0}; - Model result; + cpuid(0, 0, cpu_info); + std::memcpy(cpustr, &cpu_info[1], 4); + std::memcpy(cpustr + 4, &cpu_info[3], 4); + std::memcpy(cpustr + 8, &cpu_info[2], 4); - cpuid(1, 0, cpu_info); + Model result; - result.family = get_masked(cpu_info[0], 12, 8); - result.model = get_masked(cpu_info[0], 8, 4) | get_masked(cpu_info[0], 20, 16) << 4; - result.type_name = cpustr; + cpuid(1, 0, cpu_info); - // feature bits https://en.wikipedia.org/wiki/CPUID - // sse2 - result.sse2 = has_feature(cpu_info[3], 26); - // aes-ni - result.aes = has_feature(cpu_info[2], 25); - // avx - 27 is the check if the OS overwrote cpu features - result.avx = has_feature(cpu_info[2], 28) && has_feature(cpu_info[2], 27) ; + result.family = get_masked(cpu_info[0], 12, 8); + result.model = get_masked(cpu_info[0], 8, 4) | get_masked(cpu_info[0], 20, 16) << 4; + result.type_name = cpustr; - if(strcmp(cpustr, "AuthenticAMD") == 0) - { - if(result.family == 0xF) - result.family += get_masked(cpu_info[0], 28, 20); - } + // feature bits https://en.wikipedia.org/wiki/CPUID + // sse2 + result.sse2 = has_feature(cpu_info[3], 26); + // aes-ni + result.aes = has_feature(cpu_info[2], 25); + // avx - 27 is the check if the OS overwrote cpu features + result.avx = has_feature(cpu_info[2], 28) && has_feature(cpu_info[2], 27); - return result; + if(strcmp(cpustr, "AuthenticAMD") == 0) + { + if(result.family == 0xF) + result.family += get_masked(cpu_info[0], 28, 20); } + return result; +} + } // namespace cpu } // namespace xmrstak diff --git a/xmrstak/backend/cpu/cpuType.hpp b/xmrstak/backend/cpu/cpuType.hpp index 7f6bfaf51..2bafa4105 100644 --- a/xmrstak/backend/cpu/cpuType.hpp +++ b/xmrstak/backend/cpu/cpuType.hpp @@ -1,32 +1,30 @@ #pragma once -#include #include - +#include namespace xmrstak { namespace cpu { - struct Model - { - uint32_t family = 0u; - uint32_t model = 0u; - bool aes = false; - bool sse2 = false; - bool avx = false; - std::string type_name = "unknown"; - }; +struct Model +{ + uint32_t family = 0u; + uint32_t model = 0u; + bool aes = false; + bool sse2 = false; + bool avx = false; + std::string type_name = "unknown"; +}; - Model getModel(); +Model getModel(); - /** Mask bits between h and l and return the value +/** Mask bits between h and l and return the value * * This enables us to put in values exactly like in the manual * For example EBX[30:22] is get_masked(cpu_info[1], 31, 22) */ - int32_t get_masked(int32_t val, int32_t h, int32_t l); +int32_t get_masked(int32_t val, int32_t h, int32_t l); - } // namespace cpu } // namespace xmrstak diff --git a/xmrstak/backend/cpu/crypto/CryptonightR_gen.cpp b/xmrstak/backend/cpu/crypto/CryptonightR_gen.cpp index 2fc1a8baa..5d55987ac 100644 --- a/xmrstak/backend/cpu/crypto/CryptonightR_gen.cpp +++ b/xmrstak/backend/cpu/crypto/CryptonightR_gen.cpp @@ -1,77 +1,87 @@ #include -typedef void(*void_func)(); +typedef void (*void_func)(); -#include "xmrstak/backend/cpu/crypto/asm/cnR/CryptonightR_template.h" -#include "cryptonight_aesni.h" #include "cryptonight.h" +#include "cryptonight_aesni.h" +#include "xmrstak/backend/cpu/crypto/asm/cnR/CryptonightR_template.h" #include "xmrstak/misc/console.hpp" -static inline void add_code(uint8_t* &p, void (*p1)(), void (*p2)()) +static inline void add_code(uint8_t*& p, void (*p1)(), void (*p2)()) { - const ptrdiff_t size = reinterpret_cast(p2) - reinterpret_cast(p1); - if (size > 0) { - memcpy(p, reinterpret_cast(p1), size); - p += size; - } + const ptrdiff_t size = reinterpret_cast(p2) - reinterpret_cast(p1); + if(size > 0) + { + memcpy(p, reinterpret_cast(p1), size); + p += size; + } } -static inline void add_random_math(uint8_t* &p, const V4_Instruction* code, int code_size, const void_func* instructions, const void_func* instructions_mov, bool is_64_bit, int selected_asm) +static inline void add_random_math(uint8_t*& p, const V4_Instruction* code, int code_size, const void_func* instructions, const void_func* instructions_mov, bool is_64_bit, int selected_asm) { - uint32_t prev_rot_src = (uint32_t)(-1); - - for (int i = 0;; ++i) { - const V4_Instruction inst = code[i]; - if (inst.opcode == RET) { - break; - } - - uint8_t opcode = (inst.opcode == MUL) ? inst.opcode : (inst.opcode + 2); - uint8_t dst_index = inst.dst_index; - uint8_t src_index = inst.src_index; - - const uint32_t a = inst.dst_index; - const uint32_t b = inst.src_index; - const uint8_t c = opcode | (dst_index << V4_OPCODE_BITS) | (((src_index == 8) ? dst_index : src_index) << (V4_OPCODE_BITS + V4_DST_INDEX_BITS)); - - switch (inst.opcode) { - case ROR: - case ROL: - if (b != prev_rot_src) { - prev_rot_src = b; - add_code(p, instructions_mov[c], instructions_mov[c + 1]); - } - break; - } - - if (a == prev_rot_src) { - prev_rot_src = (uint32_t)(-1); - } - - void_func begin = instructions[c]; + uint32_t prev_rot_src = (uint32_t)(-1); + + for(int i = 0;; ++i) + { + const V4_Instruction inst = code[i]; + if(inst.opcode == RET) + { + break; + } + + uint8_t opcode = (inst.opcode == MUL) ? inst.opcode : (inst.opcode + 2); + uint8_t dst_index = inst.dst_index; + uint8_t src_index = inst.src_index; + + const uint32_t a = inst.dst_index; + const uint32_t b = inst.src_index; + const uint8_t c = opcode | (dst_index << V4_OPCODE_BITS) | (((src_index == 8) ? dst_index : src_index) << (V4_OPCODE_BITS + V4_DST_INDEX_BITS)); + + switch(inst.opcode) + { + case ROR: + case ROL: + if(b != prev_rot_src) + { + prev_rot_src = b; + add_code(p, instructions_mov[c], instructions_mov[c + 1]); + } + break; + } + + if(a == prev_rot_src) + { + prev_rot_src = (uint32_t)(-1); + } + + void_func begin = instructions[c]; // AMD == 2 - if ((selected_asm == 2) && (inst.opcode == MUL && !is_64_bit)) { - // AMD Bulldozer has latency 4 for 32-bit IMUL and 6 for 64-bit IMUL - // Always use 32-bit IMUL for AMD Bulldozer in 32-bit mode - skip prefix 0x48 and change 0x49 to 0x41 - uint8_t* prefix = reinterpret_cast(begin); - - if (*prefix == 0x49) { - *(p++) = 0x41; - } - - begin = reinterpret_cast(prefix + 1); - } - - add_code(p, begin, instructions[c + 1]); - - if (inst.opcode == ADD) { - *(uint32_t*)(p - sizeof(uint32_t) - (is_64_bit ? 3 : 0)) = inst.C; - if (is_64_bit) { - prev_rot_src = (uint32_t)(-1); - } - } - } + if((selected_asm == 2) && (inst.opcode == MUL && !is_64_bit)) + { + // AMD Bulldozer has latency 4 for 32-bit IMUL and 6 for 64-bit IMUL + // Always use 32-bit IMUL for AMD Bulldozer in 32-bit mode - skip prefix 0x48 and change 0x49 to 0x41 + uint8_t* prefix = reinterpret_cast(begin); + + if(*prefix == 0x49) + { + *(p++) = 0x41; + } + + begin = reinterpret_cast(prefix + 1); + } + + add_code(p, begin, instructions[c + 1]); + + if(inst.opcode == ADD) + { + *(uint32_t*)(p - sizeof(uint32_t) - (is_64_bit ? 3 : 0)) = inst.C; + if(is_64_bit) + { + prev_rot_src = (uint32_t)(-1); + } + } + } } void v4_compile_code(size_t N, cryptonight_ctx* ctx, int code_size) @@ -84,14 +94,14 @@ void v4_compile_code(size_t N, cryptonight_ctx* ctx, int code_size) else unprotectExecutableMemory(ctx->fun_data, allocation_size); - uint8_t* p0 = ctx->fun_data; - uint8_t* p = p0; + uint8_t* p0 = ctx->fun_data; + uint8_t* p = p0; if(ctx->fun_data != nullptr) { if(N == 2) { - add_code(p, CryptonightR_template_double_part1, CryptonightR_template_double_part2); + add_code(p, CryptonightR_template_double_part1, CryptonightR_template_double_part2); add_random_math(p, ctx->cn_r_ctx.code, code_size, instructions, instructions_mov, false, ctx->asm_version); add_code(p, CryptonightR_template_double_part2, CryptonightR_template_double_part3); add_random_math(p, ctx->cn_r_ctx.code, code_size, instructions, instructions_mov, false, ctx->asm_version); diff --git a/xmrstak/backend/cpu/crypto/c_blake256.c b/xmrstak/backend/cpu/crypto/c_blake256.c index e5fadfe74..93d9cadbb 100644 --- a/xmrstak/backend/cpu/crypto/c_blake256.c +++ b/xmrstak/backend/cpu/crypto/c_blake256.c @@ -8,66 +8,67 @@ * HMAC is specified by RFC 2104. */ -#include -#include -#include #include "c_blake256.h" +#include +#include +#include -#define U8TO32(p) \ - (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ - ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]) )) -#define U32TO8(p, v) \ - (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \ - (p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) ); +#define U8TO32(p) \ + (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ + ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]))) +#define U32TO8(p, v) \ + (p)[0] = (uint8_t)((v) >> 24); \ + (p)[1] = (uint8_t)((v) >> 16); \ + (p)[2] = (uint8_t)((v) >> 8); \ + (p)[3] = (uint8_t)((v)); const uint8_t sigma[][16] = { - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}, - {14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3}, - {11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4}, - { 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8}, - { 9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13}, - { 2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9}, - {12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11}, - {13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10}, - { 6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5}, - {10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13, 0}, - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15}, - {14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3}, - {11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4}, - { 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8} -}; + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, + {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, + {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, + {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}, + {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, + {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10}, + {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5}, + {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, + {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}}; const uint32_t cst[16] = { 0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344, 0xA4093822, 0x299F31D0, 0x082EFA98, 0xEC4E6C89, 0x452821E6, 0x38D01377, 0xBE5466CF, 0x34E90C6C, - 0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917 -}; + 0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917}; static const uint8_t padding[] = { - 0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -}; - + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; -void blake256_compress(state *S, const uint8_t *block) { +void blake256_compress(state* S, const uint8_t* block) +{ uint32_t v[16], m[16], i; -#define ROT(x,n) (((x)<<(32-n))|((x)>>(n))) -#define G(a,b,c,d,e) \ - v[a] += (m[sigma[i][e]] ^ cst[sigma[i][e+1]]) + v[b]; \ - v[d] = ROT(v[d] ^ v[a],16); \ - v[c] += v[d]; \ - v[b] = ROT(v[b] ^ v[c],12); \ - v[a] += (m[sigma[i][e+1]] ^ cst[sigma[i][e]])+v[b]; \ - v[d] = ROT(v[d] ^ v[a], 8); \ - v[c] += v[d]; \ +#define ROT(x, n) (((x) << (32 - n)) | ((x) >> (n))) +#define G(a, b, c, d, e) \ + v[a] += (m[sigma[i][e]] ^ cst[sigma[i][e + 1]]) + v[b]; \ + v[d] = ROT(v[d] ^ v[a], 16); \ + v[c] += v[d]; \ + v[b] = ROT(v[b] ^ v[c], 12); \ + v[a] += (m[sigma[i][e + 1]] ^ cst[sigma[i][e]]) + v[b]; \ + v[d] = ROT(v[d] ^ v[a], 8); \ + v[c] += v[d]; \ v[b] = ROT(v[b] ^ v[c], 7); - for (i = 0; i < 16; ++i) m[i] = U8TO32(block + i * 4); - for (i = 0; i < 8; ++i) v[i] = S->h[i]; - v[ 8] = S->s[0] ^ 0x243F6A88; - v[ 9] = S->s[1] ^ 0x85A308D3; + for(i = 0; i < 16; ++i) + m[i] = U8TO32(block + i * 4); + for(i = 0; i < 8; ++i) + v[i] = S->h[i]; + v[8] = S->s[0] ^ 0x243F6A88; + v[9] = S->s[1] ^ 0x85A308D3; v[10] = S->s[2] ^ 0x13198A2E; v[11] = S->s[3] ^ 0x03707344; v[12] = 0xA4093822; @@ -75,29 +76,34 @@ void blake256_compress(state *S, const uint8_t *block) { v[14] = 0x082EFA98; v[15] = 0xEC4E6C89; - if (S->nullt == 0) { + if(S->nullt == 0) + { v[12] ^= S->t[0]; v[13] ^= S->t[0]; v[14] ^= S->t[1]; v[15] ^= S->t[1]; } - for (i = 0; i < 14; ++i) { - G(0, 4, 8, 12, 0); - G(1, 5, 9, 13, 2); - G(2, 6, 10, 14, 4); - G(3, 7, 11, 15, 6); - G(3, 4, 9, 14, 14); - G(2, 7, 8, 13, 12); - G(0, 5, 10, 15, 8); + for(i = 0; i < 14; ++i) + { + G(0, 4, 8, 12, 0); + G(1, 5, 9, 13, 2); + G(2, 6, 10, 14, 4); + G(3, 7, 11, 15, 6); + G(3, 4, 9, 14, 14); + G(2, 7, 8, 13, 12); + G(0, 5, 10, 15, 8); G(1, 6, 11, 12, 10); } - for (i = 0; i < 16; ++i) S->h[i % 8] ^= v[i]; - for (i = 0; i < 8; ++i) S->h[i] ^= S->s[i % 4]; + for(i = 0; i < 16; ++i) + S->h[i % 8] ^= v[i]; + for(i = 0; i < 8; ++i) + S->h[i] ^= S->s[i % 4]; } -void blake256_init(state *S) { +void blake256_init(state* S) +{ S->h[0] = 0x6A09E667; S->h[1] = 0xBB67AE85; S->h[2] = 0x3C6EF372; @@ -110,7 +116,8 @@ void blake256_init(state *S) { S->s[0] = S->s[1] = S->s[2] = S->s[3] = 0; } -void blake224_init(state *S) { +void blake224_init(state* S) +{ S->h[0] = 0xC1059ED8; S->h[1] = 0x367CD507; S->h[2] = 0x3070DD17; @@ -124,57 +131,75 @@ void blake224_init(state *S) { } // datalen = number of bits -void blake256_update(state *S, const uint8_t *data, uint32_t datalen) { +void blake256_update(state* S, const uint8_t* data, uint32_t datalen) +{ int left = S->buflen >> 3; int fill = 64 - left; - if (left && (((datalen >> 3) & 0x3F) >= (unsigned) fill)) { - memcpy((void *) (S->buf + left), (void *) data, fill); + if(left && (((datalen >> 3) & 0x3F) >= (unsigned)fill)) + { + memcpy((void*)(S->buf + left), (void*)data, fill); S->t[0] += 512; - if (S->t[0] == 0) S->t[1]++; + if(S->t[0] == 0) + S->t[1]++; blake256_compress(S, S->buf); data += fill; datalen -= (fill << 3); left = 0; } - while (datalen >= 512) { + while(datalen >= 512) + { S->t[0] += 512; - if (S->t[0] == 0) S->t[1]++; + if(S->t[0] == 0) + S->t[1]++; blake256_compress(S, data); data += 64; datalen -= 512; } - if (datalen > 0) { - memcpy((void *) (S->buf + left), (void *) data, datalen >> 3); + if(datalen > 0) + { + memcpy((void*)(S->buf + left), (void*)data, datalen >> 3); S->buflen = (left << 3) + datalen; - } else { + } + else + { S->buflen = 0; } } // datalen = number of bits -void blake224_update(state *S, const uint8_t *data, uint32_t datalen) { +void blake224_update(state* S, const uint8_t* data, uint32_t datalen) +{ blake256_update(S, data, datalen); } -void blake256_final_h(state *S, uint8_t *digest, uint8_t pa, uint8_t pb) { +void blake256_final_h(state* S, uint8_t* digest, uint8_t pa, uint8_t pb) +{ uint8_t msglen[8]; uint32_t lo = S->t[0] + S->buflen, hi = S->t[1]; - if (lo < (unsigned) S->buflen) hi++; + if(lo < (unsigned)S->buflen) + hi++; U32TO8(msglen + 0, hi); U32TO8(msglen + 4, lo); - if (S->buflen == 440) { /* one padding byte */ + if(S->buflen == 440) + { /* one padding byte */ S->t[0] -= 8; blake256_update(S, &pa, 8); - } else { - if (S->buflen < 440) { /* enough space to fill the block */ - if (S->buflen == 0) S->nullt = 1; + } + else + { + if(S->buflen < 440) + { /* enough space to fill the block */ + if(S->buflen == 0) + S->nullt = 1; S->t[0] -= 440 - S->buflen; blake256_update(S, padding, 440 - S->buflen); - } else { /* need 2 compressions */ + } + else + { /* need 2 compressions */ S->t[0] -= 512 - S->buflen; blake256_update(S, padding, 512 - S->buflen); S->t[0] -= 440; @@ -187,9 +212,9 @@ void blake256_final_h(state *S, uint8_t *digest, uint8_t pa, uint8_t pb) { S->t[0] -= 64; blake256_update(S, msglen, 64); - U32TO8(digest + 0, S->h[0]); - U32TO8(digest + 4, S->h[1]); - U32TO8(digest + 8, S->h[2]); + U32TO8(digest + 0, S->h[0]); + U32TO8(digest + 4, S->h[1]); + U32TO8(digest + 8, S->h[2]); U32TO8(digest + 12, S->h[3]); U32TO8(digest + 16, S->h[4]); U32TO8(digest + 20, S->h[5]); @@ -197,16 +222,19 @@ void blake256_final_h(state *S, uint8_t *digest, uint8_t pa, uint8_t pb) { U32TO8(digest + 28, S->h[7]); } -void blake256_final(state *S, uint8_t *digest) { +void blake256_final(state* S, uint8_t* digest) +{ blake256_final_h(S, digest, 0x81, 0x01); } -void blake224_final(state *S, uint8_t *digest) { +void blake224_final(state* S, uint8_t* digest) +{ blake256_final_h(S, digest, 0x80, 0x00); } // inlen = number of bytes -void blake256_hash(uint8_t *out, const uint8_t *in, uint32_t inlen) { +void blake256_hash(uint8_t* out, const uint8_t* in, uint32_t inlen) +{ state S; blake256_init(&S); blake256_update(&S, in, inlen * 8); @@ -214,7 +242,8 @@ void blake256_hash(uint8_t *out, const uint8_t *in, uint32_t inlen) { } // inlen = number of bytes -void blake224_hash(uint8_t *out, const uint8_t *in, uint32_t inlen) { +void blake224_hash(uint8_t* out, const uint8_t* in, uint32_t inlen) +{ state S; blake224_init(&S); blake224_update(&S, in, inlen * 8); @@ -222,13 +251,15 @@ void blake224_hash(uint8_t *out, const uint8_t *in, uint32_t inlen) { } // keylen = number of bytes -void hmac_blake256_init(hmac_state *S, const uint8_t *_key, uint64_t keylen) { - const uint8_t *key = _key; +void hmac_blake256_init(hmac_state* S, const uint8_t* _key, uint64_t keylen) +{ + const uint8_t* key = _key; uint8_t keyhash[32]; uint8_t pad[64]; uint64_t i; - if (keylen > 64) { + if(keylen > 64) + { blake256_hash(keyhash, key, keylen); key = keyhash; keylen = 32; @@ -236,14 +267,16 @@ void hmac_blake256_init(hmac_state *S, const uint8_t *_key, uint64_t keylen) { blake256_init(&S->inner); memset(pad, 0x36, 64); - for (i = 0; i < keylen; ++i) { + for(i = 0; i < keylen; ++i) + { pad[i] ^= key[i]; } blake256_update(&S->inner, pad, 512); blake256_init(&S->outer); memset(pad, 0x5c, 64); - for (i = 0; i < keylen; ++i) { + for(i = 0; i < keylen; ++i) + { pad[i] ^= key[i]; } blake256_update(&S->outer, pad, 512); @@ -252,13 +285,15 @@ void hmac_blake256_init(hmac_state *S, const uint8_t *_key, uint64_t keylen) { } // keylen = number of bytes -void hmac_blake224_init(hmac_state *S, const uint8_t *_key, uint64_t keylen) { - const uint8_t *key = _key; +void hmac_blake224_init(hmac_state* S, const uint8_t* _key, uint64_t keylen) +{ + const uint8_t* key = _key; uint8_t keyhash[32]; uint8_t pad[64]; uint64_t i; - if (keylen > 64) { + if(keylen > 64) + { blake256_hash(keyhash, key, keylen); key = keyhash; keylen = 28; @@ -266,14 +301,16 @@ void hmac_blake224_init(hmac_state *S, const uint8_t *_key, uint64_t keylen) { blake224_init(&S->inner); memset(pad, 0x36, 64); - for (i = 0; i < keylen; ++i) { + for(i = 0; i < keylen; ++i) + { pad[i] ^= key[i]; } blake224_update(&S->inner, pad, 512); blake224_init(&S->outer); memset(pad, 0x5c, 64); - for (i = 0; i < keylen; ++i) { + for(i = 0; i < keylen; ++i) + { pad[i] ^= key[i]; } blake224_update(&S->outer, pad, 512); @@ -282,18 +319,21 @@ void hmac_blake224_init(hmac_state *S, const uint8_t *_key, uint64_t keylen) { } // datalen = number of bits -void hmac_blake256_update(hmac_state *S, const uint8_t *data, uint32_t datalen) { - // update the inner state - blake256_update(&S->inner, data, datalen); +void hmac_blake256_update(hmac_state* S, const uint8_t* data, uint32_t datalen) +{ + // update the inner state + blake256_update(&S->inner, data, datalen); } // datalen = number of bits -void hmac_blake224_update(hmac_state *S, const uint8_t *data, uint32_t datalen) { - // update the inner state - blake224_update(&S->inner, data, datalen); +void hmac_blake224_update(hmac_state* S, const uint8_t* data, uint32_t datalen) +{ + // update the inner state + blake224_update(&S->inner, data, datalen); } -void hmac_blake256_final(hmac_state *S, uint8_t *digest) { +void hmac_blake256_final(hmac_state* S, uint8_t* digest) +{ uint8_t ihash[32]; blake256_final(&S->inner, ihash); blake256_update(&S->outer, ihash, 256); @@ -301,7 +341,8 @@ void hmac_blake256_final(hmac_state *S, uint8_t *digest) { memset(ihash, 0, 32); } -void hmac_blake224_final(hmac_state *S, uint8_t *digest) { +void hmac_blake224_final(hmac_state* S, uint8_t* digest) +{ uint8_t ihash[32]; blake224_final(&S->inner, ihash); blake224_update(&S->outer, ihash, 224); @@ -310,7 +351,8 @@ void hmac_blake224_final(hmac_state *S, uint8_t *digest) { } // keylen = number of bytes; inlen = number of bytes -void hmac_blake256_hash(uint8_t *out, const uint8_t *key, uint64_t keylen, const uint8_t *in, uint32_t inlen) { +void hmac_blake256_hash(uint8_t* out, const uint8_t* key, uint64_t keylen, const uint8_t* in, uint32_t inlen) +{ hmac_state S; hmac_blake256_init(&S, key, keylen); hmac_blake256_update(&S, in, inlen * 8); @@ -318,7 +360,8 @@ void hmac_blake256_hash(uint8_t *out, const uint8_t *key, uint64_t keylen, const } // keylen = number of bytes; inlen = number of bytes -void hmac_blake224_hash(uint8_t *out, const uint8_t *key, uint64_t keylen, const uint8_t *in, uint32_t inlen) { +void hmac_blake224_hash(uint8_t* out, const uint8_t* key, uint64_t keylen, const uint8_t* in, uint32_t inlen) +{ hmac_state S; hmac_blake224_init(&S, key, keylen); hmac_blake224_update(&S, in, inlen * 8); diff --git a/xmrstak/backend/cpu/crypto/c_blake256.h b/xmrstak/backend/cpu/crypto/c_blake256.h index 06c7917af..9f63f88f4 100644 --- a/xmrstak/backend/cpu/crypto/c_blake256.h +++ b/xmrstak/backend/cpu/crypto/c_blake256.h @@ -3,41 +3,43 @@ #include -typedef struct { - uint32_t h[8], s[4], t[2]; - int buflen, nullt; - uint8_t buf[64]; +typedef struct +{ + uint32_t h[8], s[4], t[2]; + int buflen, nullt; + uint8_t buf[64]; } state; -typedef struct { - state inner; - state outer; +typedef struct +{ + state inner; + state outer; } hmac_state; -void blake256_init(state *); -void blake224_init(state *); +void blake256_init(state*); +void blake224_init(state*); -void blake256_update(state *, const uint8_t *, uint32_t); -void blake224_update(state *, const uint8_t *, uint32_t); +void blake256_update(state*, const uint8_t*, uint32_t); +void blake224_update(state*, const uint8_t*, uint32_t); -void blake256_final(state *, uint8_t *); -void blake224_final(state *, uint8_t *); +void blake256_final(state*, uint8_t*); +void blake224_final(state*, uint8_t*); -void blake256_hash(uint8_t *, const uint8_t *, uint32_t); -void blake224_hash(uint8_t *, const uint8_t *, uint32_t); +void blake256_hash(uint8_t*, const uint8_t*, uint32_t); +void blake224_hash(uint8_t*, const uint8_t*, uint32_t); /* HMAC functions: */ -void hmac_blake256_init(hmac_state *, const uint8_t *, uint64_t); -void hmac_blake224_init(hmac_state *, const uint8_t *, uint64_t); +void hmac_blake256_init(hmac_state*, const uint8_t*, uint64_t); +void hmac_blake224_init(hmac_state*, const uint8_t*, uint64_t); -void hmac_blake256_update(hmac_state *, const uint8_t *, uint32_t); -void hmac_blake224_update(hmac_state *, const uint8_t *, uint32_t); +void hmac_blake256_update(hmac_state*, const uint8_t*, uint32_t); +void hmac_blake224_update(hmac_state*, const uint8_t*, uint32_t); -void hmac_blake256_final(hmac_state *, uint8_t *); -void hmac_blake224_final(hmac_state *, uint8_t *); +void hmac_blake256_final(hmac_state*, uint8_t*); +void hmac_blake224_final(hmac_state*, uint8_t*); -void hmac_blake256_hash(uint8_t *, const uint8_t *, uint64_t, const uint8_t *, uint32_t); -void hmac_blake224_hash(uint8_t *, const uint8_t *, uint64_t, const uint8_t *, uint32_t); +void hmac_blake256_hash(uint8_t*, const uint8_t*, uint64_t, const uint8_t*, uint32_t); +void hmac_blake224_hash(uint8_t*, const uint8_t*, uint64_t, const uint8_t*, uint32_t); #endif /* _BLAKE256_H_ */ diff --git a/xmrstak/backend/cpu/crypto/c_groestl.c b/xmrstak/backend/cpu/crypto/c_groestl.c index 5b3523e79..bae9a9f11 100644 --- a/xmrstak/backend/cpu/crypto/c_groestl.c +++ b/xmrstak/backend/cpu/crypto/c_groestl.c @@ -14,178 +14,185 @@ #define P_TYPE 0 #define Q_TYPE 1 -const uint8_t shift_Values[2][8] = {{0,1,2,3,4,5,6,7},{1,3,5,7,0,2,4,6}}; - -const uint8_t indices_cyclic[15] = {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6}; - - -#define ROTATE_COLUMN_DOWN(v1, v2, amount_bytes, temp_var) {temp_var = (v1<<(8*amount_bytes))|(v2>>(8*(4-amount_bytes))); \ - v2 = (v2<<(8*amount_bytes))|(v1>>(8*(4-amount_bytes))); \ - v1 = temp_var;} - - -#define COLUMN(x,y,i,c0,c1,c2,c3,c4,c5,c6,c7,tv1,tv2,tu,tl,t) \ - tu = T[2*(uint32_t)x[4*c0+0]]; \ - tl = T[2*(uint32_t)x[4*c0+0]+1]; \ - tv1 = T[2*(uint32_t)x[4*c1+1]]; \ - tv2 = T[2*(uint32_t)x[4*c1+1]+1]; \ - ROTATE_COLUMN_DOWN(tv1,tv2,1,t) \ - tu ^= tv1; \ - tl ^= tv2; \ - tv1 = T[2*(uint32_t)x[4*c2+2]]; \ - tv2 = T[2*(uint32_t)x[4*c2+2]+1]; \ - ROTATE_COLUMN_DOWN(tv1,tv2,2,t) \ - tu ^= tv1; \ - tl ^= tv2; \ - tv1 = T[2*(uint32_t)x[4*c3+3]]; \ - tv2 = T[2*(uint32_t)x[4*c3+3]+1]; \ - ROTATE_COLUMN_DOWN(tv1,tv2,3,t) \ - tu ^= tv1; \ - tl ^= tv2; \ - tl ^= T[2*(uint32_t)x[4*c4+0]]; \ - tu ^= T[2*(uint32_t)x[4*c4+0]+1]; \ - tv1 = T[2*(uint32_t)x[4*c5+1]]; \ - tv2 = T[2*(uint32_t)x[4*c5+1]+1]; \ - ROTATE_COLUMN_DOWN(tv1,tv2,1,t) \ - tl ^= tv1; \ - tu ^= tv2; \ - tv1 = T[2*(uint32_t)x[4*c6+2]]; \ - tv2 = T[2*(uint32_t)x[4*c6+2]+1]; \ - ROTATE_COLUMN_DOWN(tv1,tv2,2,t) \ - tl ^= tv1; \ - tu ^= tv2; \ - tv1 = T[2*(uint32_t)x[4*c7+3]]; \ - tv2 = T[2*(uint32_t)x[4*c7+3]+1]; \ - ROTATE_COLUMN_DOWN(tv1,tv2,3,t) \ - tl ^= tv1; \ - tu ^= tv2; \ - y[i] = tu; \ - y[i+1] = tl; +const uint8_t shift_Values[2][8] = {{0, 1, 2, 3, 4, 5, 6, 7}, {1, 3, 5, 7, 0, 2, 4, 6}}; +const uint8_t indices_cyclic[15] = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6}; + +#define ROTATE_COLUMN_DOWN(v1, v2, amount_bytes, temp_var) \ + { \ + temp_var = (v1 << (8 * amount_bytes)) | (v2 >> (8 * (4 - amount_bytes))); \ + v2 = (v2 << (8 * amount_bytes)) | (v1 >> (8 * (4 - amount_bytes))); \ + v1 = temp_var; \ + } + +#define COLUMN(x, y, i, c0, c1, c2, c3, c4, c5, c6, c7, tv1, tv2, tu, tl, t) \ + tu = T[2 * (uint32_t)x[4 * c0 + 0]]; \ + tl = T[2 * (uint32_t)x[4 * c0 + 0] + 1]; \ + tv1 = T[2 * (uint32_t)x[4 * c1 + 1]]; \ + tv2 = T[2 * (uint32_t)x[4 * c1 + 1] + 1]; \ + ROTATE_COLUMN_DOWN(tv1, tv2, 1, t) \ + tu ^= tv1; \ + tl ^= tv2; \ + tv1 = T[2 * (uint32_t)x[4 * c2 + 2]]; \ + tv2 = T[2 * (uint32_t)x[4 * c2 + 2] + 1]; \ + ROTATE_COLUMN_DOWN(tv1, tv2, 2, t) \ + tu ^= tv1; \ + tl ^= tv2; \ + tv1 = T[2 * (uint32_t)x[4 * c3 + 3]]; \ + tv2 = T[2 * (uint32_t)x[4 * c3 + 3] + 1]; \ + ROTATE_COLUMN_DOWN(tv1, tv2, 3, t) \ + tu ^= tv1; \ + tl ^= tv2; \ + tl ^= T[2 * (uint32_t)x[4 * c4 + 0]]; \ + tu ^= T[2 * (uint32_t)x[4 * c4 + 0] + 1]; \ + tv1 = T[2 * (uint32_t)x[4 * c5 + 1]]; \ + tv2 = T[2 * (uint32_t)x[4 * c5 + 1] + 1]; \ + ROTATE_COLUMN_DOWN(tv1, tv2, 1, t) \ + tl ^= tv1; \ + tu ^= tv2; \ + tv1 = T[2 * (uint32_t)x[4 * c6 + 2]]; \ + tv2 = T[2 * (uint32_t)x[4 * c6 + 2] + 1]; \ + ROTATE_COLUMN_DOWN(tv1, tv2, 2, t) \ + tl ^= tv1; \ + tu ^= tv2; \ + tv1 = T[2 * (uint32_t)x[4 * c7 + 3]]; \ + tv2 = T[2 * (uint32_t)x[4 * c7 + 3] + 1]; \ + ROTATE_COLUMN_DOWN(tv1, tv2, 3, t) \ + tl ^= tv1; \ + tu ^= tv2; \ + y[i] = tu; \ + y[i + 1] = tl; /* compute one round of P (short variants) */ -static void RND512P(uint8_t *x, uint32_t *y, uint32_t r) { - uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp; - uint32_t* x32 = (uint32_t*)x; - x32[ 0] ^= 0x00000000^r; - x32[ 2] ^= 0x00000010^r; - x32[ 4] ^= 0x00000020^r; - x32[ 6] ^= 0x00000030^r; - x32[ 8] ^= 0x00000040^r; - x32[10] ^= 0x00000050^r; - x32[12] ^= 0x00000060^r; - x32[14] ^= 0x00000070^r; - COLUMN(x,y, 0, 0, 2, 4, 6, 9, 11, 13, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - COLUMN(x,y, 2, 2, 4, 6, 8, 11, 13, 15, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - COLUMN(x,y, 4, 4, 6, 8, 10, 13, 15, 1, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - COLUMN(x,y, 6, 6, 8, 10, 12, 15, 1, 3, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - COLUMN(x,y, 8, 8, 10, 12, 14, 1, 3, 5, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - COLUMN(x,y,10, 10, 12, 14, 0, 3, 5, 7, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - COLUMN(x,y,12, 12, 14, 0, 2, 5, 7, 9, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - COLUMN(x,y,14, 14, 0, 2, 4, 7, 9, 11, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); +static void RND512P(uint8_t* x, uint32_t* y, uint32_t r) +{ + uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp; + uint32_t* x32 = (uint32_t*)x; + x32[0] ^= 0x00000000 ^ r; + x32[2] ^= 0x00000010 ^ r; + x32[4] ^= 0x00000020 ^ r; + x32[6] ^= 0x00000030 ^ r; + x32[8] ^= 0x00000040 ^ r; + x32[10] ^= 0x00000050 ^ r; + x32[12] ^= 0x00000060 ^ r; + x32[14] ^= 0x00000070 ^ r; + COLUMN(x, y, 0, 0, 2, 4, 6, 9, 11, 13, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x, y, 2, 2, 4, 6, 8, 11, 13, 15, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x, y, 4, 4, 6, 8, 10, 13, 15, 1, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x, y, 6, 6, 8, 10, 12, 15, 1, 3, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x, y, 8, 8, 10, 12, 14, 1, 3, 5, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x, y, 10, 10, 12, 14, 0, 3, 5, 7, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x, y, 12, 12, 14, 0, 2, 5, 7, 9, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x, y, 14, 14, 0, 2, 4, 7, 9, 11, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); } /* compute one round of Q (short variants) */ -static void RND512Q(uint8_t *x, uint32_t *y, uint32_t r) { - uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp; - uint32_t* x32 = (uint32_t*)x; - x32[ 0] = ~x32[ 0]; - x32[ 1] ^= 0xffffffff^r; - x32[ 2] = ~x32[ 2]; - x32[ 3] ^= 0xefffffff^r; - x32[ 4] = ~x32[ 4]; - x32[ 5] ^= 0xdfffffff^r; - x32[ 6] = ~x32[ 6]; - x32[ 7] ^= 0xcfffffff^r; - x32[ 8] = ~x32[ 8]; - x32[ 9] ^= 0xbfffffff^r; - x32[10] = ~x32[10]; - x32[11] ^= 0xafffffff^r; - x32[12] = ~x32[12]; - x32[13] ^= 0x9fffffff^r; - x32[14] = ~x32[14]; - x32[15] ^= 0x8fffffff^r; - COLUMN(x,y, 0, 2, 6, 10, 14, 1, 5, 9, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - COLUMN(x,y, 2, 4, 8, 12, 0, 3, 7, 11, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - COLUMN(x,y, 4, 6, 10, 14, 2, 5, 9, 13, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - COLUMN(x,y, 6, 8, 12, 0, 4, 7, 11, 15, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - COLUMN(x,y, 8, 10, 14, 2, 6, 9, 13, 1, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - COLUMN(x,y,10, 12, 0, 4, 8, 11, 15, 3, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - COLUMN(x,y,12, 14, 2, 6, 10, 13, 1, 5, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - COLUMN(x,y,14, 0, 4, 8, 12, 15, 3, 7, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); +static void RND512Q(uint8_t* x, uint32_t* y, uint32_t r) +{ + uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp; + uint32_t* x32 = (uint32_t*)x; + x32[0] = ~x32[0]; + x32[1] ^= 0xffffffff ^ r; + x32[2] = ~x32[2]; + x32[3] ^= 0xefffffff ^ r; + x32[4] = ~x32[4]; + x32[5] ^= 0xdfffffff ^ r; + x32[6] = ~x32[6]; + x32[7] ^= 0xcfffffff ^ r; + x32[8] = ~x32[8]; + x32[9] ^= 0xbfffffff ^ r; + x32[10] = ~x32[10]; + x32[11] ^= 0xafffffff ^ r; + x32[12] = ~x32[12]; + x32[13] ^= 0x9fffffff ^ r; + x32[14] = ~x32[14]; + x32[15] ^= 0x8fffffff ^ r; + COLUMN(x, y, 0, 2, 6, 10, 14, 1, 5, 9, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x, y, 2, 4, 8, 12, 0, 3, 7, 11, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x, y, 4, 6, 10, 14, 2, 5, 9, 13, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x, y, 6, 8, 12, 0, 4, 7, 11, 15, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x, y, 8, 10, 14, 2, 6, 9, 13, 1, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x, y, 10, 12, 0, 4, 8, 11, 15, 3, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x, y, 12, 14, 2, 6, 10, 13, 1, 5, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + COLUMN(x, y, 14, 0, 4, 8, 12, 15, 3, 7, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); } /* compute compression function (short variants) */ -static void F512(uint32_t *h, const uint32_t *m) { - int i; - uint32_t Ptmp[2*COLS512]; - uint32_t Qtmp[2*COLS512]; - uint32_t y[2*COLS512]; - uint32_t z[2*COLS512]; - - for (i = 0; i < 2*COLS512; i++) { - z[i] = m[i]; - Ptmp[i] = h[i]^m[i]; - } - - /* compute Q(m) */ - RND512Q((uint8_t*)z, y, 0x00000000); - RND512Q((uint8_t*)y, z, 0x01000000); - RND512Q((uint8_t*)z, y, 0x02000000); - RND512Q((uint8_t*)y, z, 0x03000000); - RND512Q((uint8_t*)z, y, 0x04000000); - RND512Q((uint8_t*)y, z, 0x05000000); - RND512Q((uint8_t*)z, y, 0x06000000); - RND512Q((uint8_t*)y, z, 0x07000000); - RND512Q((uint8_t*)z, y, 0x08000000); - RND512Q((uint8_t*)y, Qtmp, 0x09000000); - - /* compute P(h+m) */ - RND512P((uint8_t*)Ptmp, y, 0x00000000); - RND512P((uint8_t*)y, z, 0x00000001); - RND512P((uint8_t*)z, y, 0x00000002); - RND512P((uint8_t*)y, z, 0x00000003); - RND512P((uint8_t*)z, y, 0x00000004); - RND512P((uint8_t*)y, z, 0x00000005); - RND512P((uint8_t*)z, y, 0x00000006); - RND512P((uint8_t*)y, z, 0x00000007); - RND512P((uint8_t*)z, y, 0x00000008); - RND512P((uint8_t*)y, Ptmp, 0x00000009); - - /* compute P(h+m) + Q(m) + h */ - for (i = 0; i < 2*COLS512; i++) { - h[i] ^= Ptmp[i]^Qtmp[i]; - } -} +static void F512(uint32_t* h, const uint32_t* m) +{ + int i; + uint32_t Ptmp[2 * COLS512]; + uint32_t Qtmp[2 * COLS512]; + uint32_t y[2 * COLS512]; + uint32_t z[2 * COLS512]; + + for(i = 0; i < 2 * COLS512; i++) + { + z[i] = m[i]; + Ptmp[i] = h[i] ^ m[i]; + } + /* compute Q(m) */ + RND512Q((uint8_t*)z, y, 0x00000000); + RND512Q((uint8_t*)y, z, 0x01000000); + RND512Q((uint8_t*)z, y, 0x02000000); + RND512Q((uint8_t*)y, z, 0x03000000); + RND512Q((uint8_t*)z, y, 0x04000000); + RND512Q((uint8_t*)y, z, 0x05000000); + RND512Q((uint8_t*)z, y, 0x06000000); + RND512Q((uint8_t*)y, z, 0x07000000); + RND512Q((uint8_t*)z, y, 0x08000000); + RND512Q((uint8_t*)y, Qtmp, 0x09000000); + + /* compute P(h+m) */ + RND512P((uint8_t*)Ptmp, y, 0x00000000); + RND512P((uint8_t*)y, z, 0x00000001); + RND512P((uint8_t*)z, y, 0x00000002); + RND512P((uint8_t*)y, z, 0x00000003); + RND512P((uint8_t*)z, y, 0x00000004); + RND512P((uint8_t*)y, z, 0x00000005); + RND512P((uint8_t*)z, y, 0x00000006); + RND512P((uint8_t*)y, z, 0x00000007); + RND512P((uint8_t*)z, y, 0x00000008); + RND512P((uint8_t*)y, Ptmp, 0x00000009); + + /* compute P(h+m) + Q(m) + h */ + for(i = 0; i < 2 * COLS512; i++) + { + h[i] ^= Ptmp[i] ^ Qtmp[i]; + } +} /* digest up to msglen bytes of input (full blocks only) */ -static void Transform(groestlHashState *ctx, - const uint8_t *input, - int msglen) { +static void Transform(groestlHashState* ctx, + const uint8_t* input, + int msglen) +{ - /* digest message, one block at a time */ - for (; msglen >= SIZE512; - msglen -= SIZE512, input += SIZE512) { - F512(ctx->chaining,(uint32_t*)input); + /* digest message, one block at a time */ + for(; msglen >= SIZE512; + msglen -= SIZE512, input += SIZE512) + { + F512(ctx->chaining, (uint32_t*)input); - /* increment block counter */ - ctx->block_counter1++; - if (ctx->block_counter1 == 0) ctx->block_counter2++; - } + /* increment block counter */ + ctx->block_counter1++; + if(ctx->block_counter1 == 0) + ctx->block_counter2++; + } } /* given state h, do h <- P(h)+h */ -static void OutputTransformation(groestlHashState *ctx) { - int j; - uint32_t temp[2*COLS512]; - uint32_t y[2*COLS512]; - uint32_t z[2*COLS512]; - - - - for (j = 0; j < 2*COLS512; j++) { - temp[j] = ctx->chaining[j]; +static void OutputTransformation(groestlHashState* ctx) +{ + int j; + uint32_t temp[2 * COLS512]; + uint32_t y[2 * COLS512]; + uint32_t z[2 * COLS512]; + + for(j = 0; j < 2 * COLS512; j++) + { + temp[j] = ctx->chaining[j]; } RND512P((uint8_t*)temp, y, 0x00000000); RND512P((uint8_t*)y, z, 0x00000001); @@ -197,75 +204,84 @@ static void OutputTransformation(groestlHashState *ctx) { RND512P((uint8_t*)y, z, 0x00000007); RND512P((uint8_t*)z, y, 0x00000008); RND512P((uint8_t*)y, temp, 0x00000009); - for (j = 0; j < 2*COLS512; j++) { - ctx->chaining[j] ^= temp[j]; + for(j = 0; j < 2 * COLS512; j++) + { + ctx->chaining[j] ^= temp[j]; } } /* initialise context */ -static void Init(groestlHashState* ctx) { - int i = 0; - /* allocate memory for state and data buffer */ - - for(;i<(SIZE512/sizeof(uint32_t));i++) - { - ctx->chaining[i] = 0; - } - - /* set initial value */ - ctx->chaining[2*COLS512-1] = u32BIG((uint32_t)HASH_BIT_LEN); - - /* set other variables */ - ctx->buf_ptr = 0; - ctx->block_counter1 = 0; - ctx->block_counter2 = 0; - ctx->bits_in_last_byte = 0; +static void Init(groestlHashState* ctx) +{ + int i = 0; + /* allocate memory for state and data buffer */ + + for(; i < (SIZE512 / sizeof(uint32_t)); i++) + { + ctx->chaining[i] = 0; + } + + /* set initial value */ + ctx->chaining[2 * COLS512 - 1] = u32BIG((uint32_t)HASH_BIT_LEN); + + /* set other variables */ + ctx->buf_ptr = 0; + ctx->block_counter1 = 0; + ctx->block_counter2 = 0; + ctx->bits_in_last_byte = 0; } /* update state with databitlen bits of input */ static void Update(groestlHashState* ctx, - const BitSequence* input, - DataLength databitlen) { - int index = 0; - int msglen = (int)(databitlen/8); - int rem = (int)(databitlen%8); + const BitSequence* input, + DataLength databitlen) +{ + int index = 0; + int msglen = (int)(databitlen / 8); + int rem = (int)(databitlen % 8); - /* if the buffer contains data that has not yet been digested, first + /* if the buffer contains data that has not yet been digested, first add data to buffer until full */ - if (ctx->buf_ptr) { - while (ctx->buf_ptr < SIZE512 && index < msglen) { - ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; - } - if (ctx->buf_ptr < SIZE512) { - /* buffer still not full, return */ - if (rem) { - ctx->bits_in_last_byte = rem; - ctx->buffer[(int)ctx->buf_ptr++] = input[index]; - } - return; + if(ctx->buf_ptr) + { + while(ctx->buf_ptr < SIZE512 && index < msglen) + { + ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; + } + if(ctx->buf_ptr < SIZE512) + { + /* buffer still not full, return */ + if(rem) + { + ctx->bits_in_last_byte = rem; + ctx->buffer[(int)ctx->buf_ptr++] = input[index]; + } + return; + } + + /* digest buffer */ + ctx->buf_ptr = 0; + Transform(ctx, ctx->buffer, SIZE512); } - /* digest buffer */ - ctx->buf_ptr = 0; - Transform(ctx, ctx->buffer, SIZE512); - } + /* digest bulk of message */ + Transform(ctx, input + index, msglen - index); + index += ((msglen - index) / SIZE512) * SIZE512; - /* digest bulk of message */ - Transform(ctx, input+index, msglen-index); - index += ((msglen-index)/SIZE512)*SIZE512; - - /* store remaining data in buffer */ - while (index < msglen) { - ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; - } + /* store remaining data in buffer */ + while(index < msglen) + { + ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; + } - /* if non-integral number of bytes have been supplied, store + /* if non-integral number of bytes have been supplied, store remaining bits in last byte, together with information about number of bits */ - if (rem) { - ctx->bits_in_last_byte = rem; - ctx->buffer[(int)ctx->buf_ptr++] = input[index]; - } + if(rem) + { + ctx->bits_in_last_byte = rem; + ctx->buffer[(int)ctx->buf_ptr++] = input[index]; + } } #define BILB ctx->bits_in_last_byte @@ -273,80 +289,92 @@ static void Update(groestlHashState* ctx, /* finalise: process remaining data (including padding), perform output transformation, and write hash result to 'output' */ static void Final(groestlHashState* ctx, - BitSequence* output) { - int i, j = 0, hashbytelen = HASH_BIT_LEN/8; - uint8_t *s = (BitSequence*)ctx->chaining; - - /* pad with '1'-bit and first few '0'-bits */ - if (BILB) { - ctx->buffer[(int)ctx->buf_ptr-1] &= ((1<buffer[(int)ctx->buf_ptr-1] ^= 0x1<<(7-BILB); - BILB = 0; - } - else ctx->buffer[(int)ctx->buf_ptr++] = 0x80; - - /* pad with '0'-bits */ - if (ctx->buf_ptr > SIZE512-LENGTHFIELDLEN) { - /* padding requires two blocks */ - while (ctx->buf_ptr < SIZE512) { - ctx->buffer[(int)ctx->buf_ptr++] = 0; + BitSequence* output) +{ + int i, j = 0, hashbytelen = HASH_BIT_LEN / 8; + uint8_t* s = (BitSequence*)ctx->chaining; + + /* pad with '1'-bit and first few '0'-bits */ + if(BILB) + { + ctx->buffer[(int)ctx->buf_ptr - 1] &= ((1 << BILB) - 1) << (8 - BILB); + ctx->buffer[(int)ctx->buf_ptr - 1] ^= 0x1 << (7 - BILB); + BILB = 0; + } + else + ctx->buffer[(int)ctx->buf_ptr++] = 0x80; + + /* pad with '0'-bits */ + if(ctx->buf_ptr > SIZE512 - LENGTHFIELDLEN) + { + /* padding requires two blocks */ + while(ctx->buf_ptr < SIZE512) + { + ctx->buffer[(int)ctx->buf_ptr++] = 0; + } + /* digest first padding block */ + Transform(ctx, ctx->buffer, SIZE512); + ctx->buf_ptr = 0; } - /* digest first padding block */ + while(ctx->buf_ptr < SIZE512 - LENGTHFIELDLEN) + { + ctx->buffer[(int)ctx->buf_ptr++] = 0; + } + + /* length padding */ + ctx->block_counter1++; + if(ctx->block_counter1 == 0) + ctx->block_counter2++; + ctx->buf_ptr = SIZE512; + + while(ctx->buf_ptr > SIZE512 - (int)sizeof(uint32_t)) + { + ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter1; + ctx->block_counter1 >>= 8; + } + while(ctx->buf_ptr > SIZE512 - LENGTHFIELDLEN) + { + ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter2; + ctx->block_counter2 >>= 8; + } + /* digest final padding block */ Transform(ctx, ctx->buffer, SIZE512); - ctx->buf_ptr = 0; - } - while (ctx->buf_ptr < SIZE512-LENGTHFIELDLEN) { - ctx->buffer[(int)ctx->buf_ptr++] = 0; - } - - /* length padding */ - ctx->block_counter1++; - if (ctx->block_counter1 == 0) ctx->block_counter2++; - ctx->buf_ptr = SIZE512; - - while (ctx->buf_ptr > SIZE512-(int)sizeof(uint32_t)) { - ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter1; - ctx->block_counter1 >>= 8; - } - while (ctx->buf_ptr > SIZE512-LENGTHFIELDLEN) { - ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter2; - ctx->block_counter2 >>= 8; - } - /* digest final padding block */ - Transform(ctx, ctx->buffer, SIZE512); - /* perform output transformation */ - OutputTransformation(ctx); - - /* store hash result in output */ - for (i = SIZE512-hashbytelen; i < SIZE512; i++,j++) { - output[j] = s[i]; - } - - /* zeroise relevant variables and deallocate memory */ - for (i = 0; i < COLS512; i++) { - ctx->chaining[i] = 0; - } - for (i = 0; i < SIZE512; i++) { - ctx->buffer[i] = 0; - } + /* perform output transformation */ + OutputTransformation(ctx); + + /* store hash result in output */ + for(i = SIZE512 - hashbytelen; i < SIZE512; i++, j++) + { + output[j] = s[i]; + } + + /* zeroise relevant variables and deallocate memory */ + for(i = 0; i < COLS512; i++) + { + ctx->chaining[i] = 0; + } + for(i = 0; i < SIZE512; i++) + { + ctx->buffer[i] = 0; + } } /* hash bit sequence */ void groestl(const BitSequence* data, - DataLength databitlen, - BitSequence* hashval) { + DataLength databitlen, + BitSequence* hashval) +{ - groestlHashState context; + groestlHashState context; - /* initialise */ + /* initialise */ Init(&context); + /* process message */ + Update(&context, data, databitlen); - /* process message */ - Update(&context, data, databitlen); - - /* finalise */ - Final(&context, hashval); + /* finalise */ + Final(&context, hashval); } /* static int crypto_hash(unsigned char *out, diff --git a/xmrstak/backend/cpu/crypto/c_groestl.h b/xmrstak/backend/cpu/crypto/c_groestl.h index 47044b462..5322a2e2e 100644 --- a/xmrstak/backend/cpu/crypto/c_groestl.h +++ b/xmrstak/backend/cpu/crypto/c_groestl.h @@ -1,10 +1,10 @@ #ifndef __hash_h #define __hash_h /* -#include "crypto_uint8.h" +#include "crypto_hash.h" #include "crypto_uint32.h" #include "crypto_uint64.h" -#include "crypto_hash.h" +#include "crypto_uint8.h" typedef crypto_uint8 uint8_t; typedef crypto_uint32 uint32_t; @@ -19,29 +19,28 @@ typedef crypto_uint64 uint64_t; #define LENGTHFIELDLEN ROWS #define COLS512 8 -#define SIZE512 (ROWS*COLS512) +#define SIZE512 (ROWS * COLS512) #define ROUNDS512 10 #define HASH_BIT_LEN 256 -#define ROTL32(v, n) ((((v)<<(n))|((v)>>(32-(n))))&li_32(ffffffff)) - +#define ROTL32(v, n) ((((v) << (n)) | ((v) >> (32 - (n)))) & li_32(ffffffff)) #define li_32(h) 0x##h##u -#define EXT_BYTE(var,n) ((uint8_t)((uint32_t)(var) >> (8*n))) -#define u32BIG(a) \ - ((ROTL32(a,8) & li_32(00FF00FF)) | \ - (ROTL32(a,24) & li_32(FF00FF00))) - +#define EXT_BYTE(var, n) ((uint8_t)((uint32_t)(var) >> (8 * n))) +#define u32BIG(a) \ + ((ROTL32(a, 8) & li_32(00FF00FF)) | \ + (ROTL32(a, 24) & li_32(FF00FF00))) /* NIST API begin */ -typedef struct { - uint32_t chaining[SIZE512/sizeof(uint32_t)]; /* actual state */ - uint32_t block_counter1, - block_counter2; /* message block counter(s) */ - BitSequence buffer[SIZE512]; /* data buffer */ - int buf_ptr; /* data buffer pointer */ - int bits_in_last_byte; /* no. of message bits in last byte of +typedef struct +{ + uint32_t chaining[SIZE512 / sizeof(uint32_t)]; /* actual state */ + uint32_t block_counter1, + block_counter2; /* message block counter(s) */ + BitSequence buffer[SIZE512]; /* data buffer */ + int buf_ptr; /* data buffer pointer */ + int bits_in_last_byte; /* no. of message bits in last byte of data buffer */ } groestlHashState; diff --git a/xmrstak/backend/cpu/crypto/c_jh.c b/xmrstak/backend/cpu/crypto/c_jh.c index 0256a0fa2..e50886dee 100644 --- a/xmrstak/backend/cpu/crypto/c_jh.c +++ b/xmrstak/backend/cpu/crypto/c_jh.c @@ -23,345 +23,400 @@ typedef uint64_t uint64; /*define data alignment for different C compilers*/ #if defined(__GNUC__) - #define DATA_ALIGN16(x) x __attribute__ ((aligned(16))) +#define DATA_ALIGN16(x) x __attribute__((aligned(16))) #else - #define DATA_ALIGN16(x) __declspec(align(16)) x +#define DATA_ALIGN16(x) __declspec(align(16)) x #endif - -typedef struct { - int hashbitlen; /*the message digest size*/ - unsigned long long databitlen; /*the message size in bits*/ - unsigned long long datasize_in_buffer; /*the size of the message remained in buffer; assumed to be multiple of 8bits except for the last partial block at the end of the message*/ - DATA_ALIGN16(uint64 x[8][2]); /*the 1024-bit state, ( x[i][0] || x[i][1] ) is the ith row of the state in the pseudocode*/ - unsigned char buffer[64]; /*the 512-bit message block to be hashed;*/ +typedef struct +{ + int hashbitlen; /*the message digest size*/ + unsigned long long databitlen; /*the message size in bits*/ + unsigned long long datasize_in_buffer; /*the size of the message remained in buffer; assumed to be multiple of 8bits except for the last partial block at the end of the message*/ + DATA_ALIGN16(uint64 x[8][2]); /*the 1024-bit state, ( x[i][0] || x[i][1] ) is the ith row of the state in the pseudocode*/ + unsigned char buffer[64]; /*the 512-bit message block to be hashed;*/ } hashState; - /*The initial hash value H(0)*/ -const unsigned char JH224_H0[128]={0x2d,0xfe,0xdd,0x62,0xf9,0x9a,0x98,0xac,0xae,0x7c,0xac,0xd6,0x19,0xd6,0x34,0xe7,0xa4,0x83,0x10,0x5,0xbc,0x30,0x12,0x16,0xb8,0x60,0x38,0xc6,0xc9,0x66,0x14,0x94,0x66,0xd9,0x89,0x9f,0x25,0x80,0x70,0x6f,0xce,0x9e,0xa3,0x1b,0x1d,0x9b,0x1a,0xdc,0x11,0xe8,0x32,0x5f,0x7b,0x36,0x6e,0x10,0xf9,0x94,0x85,0x7f,0x2,0xfa,0x6,0xc1,0x1b,0x4f,0x1b,0x5c,0xd8,0xc8,0x40,0xb3,0x97,0xf6,0xa1,0x7f,0x6e,0x73,0x80,0x99,0xdc,0xdf,0x93,0xa5,0xad,0xea,0xa3,0xd3,0xa4,0x31,0xe8,0xde,0xc9,0x53,0x9a,0x68,0x22,0xb4,0xa9,0x8a,0xec,0x86,0xa1,0xe4,0xd5,0x74,0xac,0x95,0x9c,0xe5,0x6c,0xf0,0x15,0x96,0xd,0xea,0xb5,0xab,0x2b,0xbf,0x96,0x11,0xdc,0xf0,0xdd,0x64,0xea,0x6e}; -const unsigned char JH256_H0[128]={0xeb,0x98,0xa3,0x41,0x2c,0x20,0xd3,0xeb,0x92,0xcd,0xbe,0x7b,0x9c,0xb2,0x45,0xc1,0x1c,0x93,0x51,0x91,0x60,0xd4,0xc7,0xfa,0x26,0x0,0x82,0xd6,0x7e,0x50,0x8a,0x3,0xa4,0x23,0x9e,0x26,0x77,0x26,0xb9,0x45,0xe0,0xfb,0x1a,0x48,0xd4,0x1a,0x94,0x77,0xcd,0xb5,0xab,0x26,0x2,0x6b,0x17,0x7a,0x56,0xf0,0x24,0x42,0xf,0xff,0x2f,0xa8,0x71,0xa3,0x96,0x89,0x7f,0x2e,0x4d,0x75,0x1d,0x14,0x49,0x8,0xf7,0x7d,0xe2,0x62,0x27,0x76,0x95,0xf7,0x76,0x24,0x8f,0x94,0x87,0xd5,0xb6,0x57,0x47,0x80,0x29,0x6c,0x5c,0x5e,0x27,0x2d,0xac,0x8e,0xd,0x6c,0x51,0x84,0x50,0xc6,0x57,0x5,0x7a,0xf,0x7b,0xe4,0xd3,0x67,0x70,0x24,0x12,0xea,0x89,0xe3,0xab,0x13,0xd3,0x1c,0xd7,0x69}; -const unsigned char JH384_H0[128]={0x48,0x1e,0x3b,0xc6,0xd8,0x13,0x39,0x8a,0x6d,0x3b,0x5e,0x89,0x4a,0xde,0x87,0x9b,0x63,0xfa,0xea,0x68,0xd4,0x80,0xad,0x2e,0x33,0x2c,0xcb,0x21,0x48,0xf,0x82,0x67,0x98,0xae,0xc8,0x4d,0x90,0x82,0xb9,0x28,0xd4,0x55,0xea,0x30,0x41,0x11,0x42,0x49,0x36,0xf5,0x55,0xb2,0x92,0x48,0x47,0xec,0xc7,0x25,0xa,0x93,0xba,0xf4,0x3c,0xe1,0x56,0x9b,0x7f,0x8a,0x27,0xdb,0x45,0x4c,0x9e,0xfc,0xbd,0x49,0x63,0x97,0xaf,0xe,0x58,0x9f,0xc2,0x7d,0x26,0xaa,0x80,0xcd,0x80,0xc0,0x8b,0x8c,0x9d,0xeb,0x2e,0xda,0x8a,0x79,0x81,0xe8,0xf8,0xd5,0x37,0x3a,0xf4,0x39,0x67,0xad,0xdd,0xd1,0x7a,0x71,0xa9,0xb4,0xd3,0xbd,0xa4,0x75,0xd3,0x94,0x97,0x6c,0x3f,0xba,0x98,0x42,0x73,0x7f}; -const unsigned char JH512_H0[128]={0x6f,0xd1,0x4b,0x96,0x3e,0x0,0xaa,0x17,0x63,0x6a,0x2e,0x5,0x7a,0x15,0xd5,0x43,0x8a,0x22,0x5e,0x8d,0xc,0x97,0xef,0xb,0xe9,0x34,0x12,0x59,0xf2,0xb3,0xc3,0x61,0x89,0x1d,0xa0,0xc1,0x53,0x6f,0x80,0x1e,0x2a,0xa9,0x5,0x6b,0xea,0x2b,0x6d,0x80,0x58,0x8e,0xcc,0xdb,0x20,0x75,0xba,0xa6,0xa9,0xf,0x3a,0x76,0xba,0xf8,0x3b,0xf7,0x1,0x69,0xe6,0x5,0x41,0xe3,0x4a,0x69,0x46,0xb5,0x8a,0x8e,0x2e,0x6f,0xe6,0x5a,0x10,0x47,0xa7,0xd0,0xc1,0x84,0x3c,0x24,0x3b,0x6e,0x71,0xb1,0x2d,0x5a,0xc1,0x99,0xcf,0x57,0xf6,0xec,0x9d,0xb1,0xf8,0x56,0xa7,0x6,0x88,0x7c,0x57,0x16,0xb1,0x56,0xe3,0xc2,0xfc,0xdf,0xe6,0x85,0x17,0xfb,0x54,0x5a,0x46,0x78,0xcc,0x8c,0xdd,0x4b}; +const unsigned char JH224_H0[128] = {0x2d, 0xfe, 0xdd, 0x62, 0xf9, 0x9a, 0x98, 0xac, 0xae, 0x7c, 0xac, 0xd6, 0x19, 0xd6, 0x34, 0xe7, 0xa4, 0x83, 0x10, 0x5, 0xbc, 0x30, 0x12, 0x16, 0xb8, 0x60, 0x38, 0xc6, 0xc9, 0x66, 0x14, 0x94, 0x66, 0xd9, 0x89, 0x9f, 0x25, 0x80, 0x70, 0x6f, 0xce, 0x9e, 0xa3, 0x1b, 0x1d, 0x9b, 0x1a, 0xdc, 0x11, 0xe8, 0x32, 0x5f, 0x7b, 0x36, 0x6e, 0x10, 0xf9, 0x94, 0x85, 0x7f, 0x2, 0xfa, 0x6, 0xc1, 0x1b, 0x4f, 0x1b, 0x5c, 0xd8, 0xc8, 0x40, 0xb3, 0x97, 0xf6, 0xa1, 0x7f, 0x6e, 0x73, 0x80, 0x99, 0xdc, 0xdf, 0x93, 0xa5, 0xad, 0xea, 0xa3, 0xd3, 0xa4, 0x31, 0xe8, 0xde, 0xc9, 0x53, 0x9a, 0x68, 0x22, 0xb4, 0xa9, 0x8a, 0xec, 0x86, 0xa1, 0xe4, 0xd5, 0x74, 0xac, 0x95, 0x9c, 0xe5, 0x6c, 0xf0, 0x15, 0x96, 0xd, 0xea, 0xb5, 0xab, 0x2b, 0xbf, 0x96, 0x11, 0xdc, 0xf0, 0xdd, 0x64, 0xea, 0x6e}; +const unsigned char JH256_H0[128] = {0xeb, 0x98, 0xa3, 0x41, 0x2c, 0x20, 0xd3, 0xeb, 0x92, 0xcd, 0xbe, 0x7b, 0x9c, 0xb2, 0x45, 0xc1, 0x1c, 0x93, 0x51, 0x91, 0x60, 0xd4, 0xc7, 0xfa, 0x26, 0x0, 0x82, 0xd6, 0x7e, 0x50, 0x8a, 0x3, 0xa4, 0x23, 0x9e, 0x26, 0x77, 0x26, 0xb9, 0x45, 0xe0, 0xfb, 0x1a, 0x48, 0xd4, 0x1a, 0x94, 0x77, 0xcd, 0xb5, 0xab, 0x26, 0x2, 0x6b, 0x17, 0x7a, 0x56, 0xf0, 0x24, 0x42, 0xf, 0xff, 0x2f, 0xa8, 0x71, 0xa3, 0x96, 0x89, 0x7f, 0x2e, 0x4d, 0x75, 0x1d, 0x14, 0x49, 0x8, 0xf7, 0x7d, 0xe2, 0x62, 0x27, 0x76, 0x95, 0xf7, 0x76, 0x24, 0x8f, 0x94, 0x87, 0xd5, 0xb6, 0x57, 0x47, 0x80, 0x29, 0x6c, 0x5c, 0x5e, 0x27, 0x2d, 0xac, 0x8e, 0xd, 0x6c, 0x51, 0x84, 0x50, 0xc6, 0x57, 0x5, 0x7a, 0xf, 0x7b, 0xe4, 0xd3, 0x67, 0x70, 0x24, 0x12, 0xea, 0x89, 0xe3, 0xab, 0x13, 0xd3, 0x1c, 0xd7, 0x69}; +const unsigned char JH384_H0[128] = {0x48, 0x1e, 0x3b, 0xc6, 0xd8, 0x13, 0x39, 0x8a, 0x6d, 0x3b, 0x5e, 0x89, 0x4a, 0xde, 0x87, 0x9b, 0x63, 0xfa, 0xea, 0x68, 0xd4, 0x80, 0xad, 0x2e, 0x33, 0x2c, 0xcb, 0x21, 0x48, 0xf, 0x82, 0x67, 0x98, 0xae, 0xc8, 0x4d, 0x90, 0x82, 0xb9, 0x28, 0xd4, 0x55, 0xea, 0x30, 0x41, 0x11, 0x42, 0x49, 0x36, 0xf5, 0x55, 0xb2, 0x92, 0x48, 0x47, 0xec, 0xc7, 0x25, 0xa, 0x93, 0xba, 0xf4, 0x3c, 0xe1, 0x56, 0x9b, 0x7f, 0x8a, 0x27, 0xdb, 0x45, 0x4c, 0x9e, 0xfc, 0xbd, 0x49, 0x63, 0x97, 0xaf, 0xe, 0x58, 0x9f, 0xc2, 0x7d, 0x26, 0xaa, 0x80, 0xcd, 0x80, 0xc0, 0x8b, 0x8c, 0x9d, 0xeb, 0x2e, 0xda, 0x8a, 0x79, 0x81, 0xe8, 0xf8, 0xd5, 0x37, 0x3a, 0xf4, 0x39, 0x67, 0xad, 0xdd, 0xd1, 0x7a, 0x71, 0xa9, 0xb4, 0xd3, 0xbd, 0xa4, 0x75, 0xd3, 0x94, 0x97, 0x6c, 0x3f, 0xba, 0x98, 0x42, 0x73, 0x7f}; +const unsigned char JH512_H0[128] = {0x6f, 0xd1, 0x4b, 0x96, 0x3e, 0x0, 0xaa, 0x17, 0x63, 0x6a, 0x2e, 0x5, 0x7a, 0x15, 0xd5, 0x43, 0x8a, 0x22, 0x5e, 0x8d, 0xc, 0x97, 0xef, 0xb, 0xe9, 0x34, 0x12, 0x59, 0xf2, 0xb3, 0xc3, 0x61, 0x89, 0x1d, 0xa0, 0xc1, 0x53, 0x6f, 0x80, 0x1e, 0x2a, 0xa9, 0x5, 0x6b, 0xea, 0x2b, 0x6d, 0x80, 0x58, 0x8e, 0xcc, 0xdb, 0x20, 0x75, 0xba, 0xa6, 0xa9, 0xf, 0x3a, 0x76, 0xba, 0xf8, 0x3b, 0xf7, 0x1, 0x69, 0xe6, 0x5, 0x41, 0xe3, 0x4a, 0x69, 0x46, 0xb5, 0x8a, 0x8e, 0x2e, 0x6f, 0xe6, 0x5a, 0x10, 0x47, 0xa7, 0xd0, 0xc1, 0x84, 0x3c, 0x24, 0x3b, 0x6e, 0x71, 0xb1, 0x2d, 0x5a, 0xc1, 0x99, 0xcf, 0x57, 0xf6, 0xec, 0x9d, 0xb1, 0xf8, 0x56, 0xa7, 0x6, 0x88, 0x7c, 0x57, 0x16, 0xb1, 0x56, 0xe3, 0xc2, 0xfc, 0xdf, 0xe6, 0x85, 0x17, 0xfb, 0x54, 0x5a, 0x46, 0x78, 0xcc, 0x8c, 0xdd, 0x4b}; /*42 round constants, each round constant is 32-byte (256-bit)*/ -const unsigned char E8_bitslice_roundconstant[42][32]={ -{0x72,0xd5,0xde,0xa2,0xdf,0x15,0xf8,0x67,0x7b,0x84,0x15,0xa,0xb7,0x23,0x15,0x57,0x81,0xab,0xd6,0x90,0x4d,0x5a,0x87,0xf6,0x4e,0x9f,0x4f,0xc5,0xc3,0xd1,0x2b,0x40}, -{0xea,0x98,0x3a,0xe0,0x5c,0x45,0xfa,0x9c,0x3,0xc5,0xd2,0x99,0x66,0xb2,0x99,0x9a,0x66,0x2,0x96,0xb4,0xf2,0xbb,0x53,0x8a,0xb5,0x56,0x14,0x1a,0x88,0xdb,0xa2,0x31}, -{0x3,0xa3,0x5a,0x5c,0x9a,0x19,0xe,0xdb,0x40,0x3f,0xb2,0xa,0x87,0xc1,0x44,0x10,0x1c,0x5,0x19,0x80,0x84,0x9e,0x95,0x1d,0x6f,0x33,0xeb,0xad,0x5e,0xe7,0xcd,0xdc}, -{0x10,0xba,0x13,0x92,0x2,0xbf,0x6b,0x41,0xdc,0x78,0x65,0x15,0xf7,0xbb,0x27,0xd0,0xa,0x2c,0x81,0x39,0x37,0xaa,0x78,0x50,0x3f,0x1a,0xbf,0xd2,0x41,0x0,0x91,0xd3}, -{0x42,0x2d,0x5a,0xd,0xf6,0xcc,0x7e,0x90,0xdd,0x62,0x9f,0x9c,0x92,0xc0,0x97,0xce,0x18,0x5c,0xa7,0xb,0xc7,0x2b,0x44,0xac,0xd1,0xdf,0x65,0xd6,0x63,0xc6,0xfc,0x23}, -{0x97,0x6e,0x6c,0x3,0x9e,0xe0,0xb8,0x1a,0x21,0x5,0x45,0x7e,0x44,0x6c,0xec,0xa8,0xee,0xf1,0x3,0xbb,0x5d,0x8e,0x61,0xfa,0xfd,0x96,0x97,0xb2,0x94,0x83,0x81,0x97}, -{0x4a,0x8e,0x85,0x37,0xdb,0x3,0x30,0x2f,0x2a,0x67,0x8d,0x2d,0xfb,0x9f,0x6a,0x95,0x8a,0xfe,0x73,0x81,0xf8,0xb8,0x69,0x6c,0x8a,0xc7,0x72,0x46,0xc0,0x7f,0x42,0x14}, -{0xc5,0xf4,0x15,0x8f,0xbd,0xc7,0x5e,0xc4,0x75,0x44,0x6f,0xa7,0x8f,0x11,0xbb,0x80,0x52,0xde,0x75,0xb7,0xae,0xe4,0x88,0xbc,0x82,0xb8,0x0,0x1e,0x98,0xa6,0xa3,0xf4}, -{0x8e,0xf4,0x8f,0x33,0xa9,0xa3,0x63,0x15,0xaa,0x5f,0x56,0x24,0xd5,0xb7,0xf9,0x89,0xb6,0xf1,0xed,0x20,0x7c,0x5a,0xe0,0xfd,0x36,0xca,0xe9,0x5a,0x6,0x42,0x2c,0x36}, -{0xce,0x29,0x35,0x43,0x4e,0xfe,0x98,0x3d,0x53,0x3a,0xf9,0x74,0x73,0x9a,0x4b,0xa7,0xd0,0xf5,0x1f,0x59,0x6f,0x4e,0x81,0x86,0xe,0x9d,0xad,0x81,0xaf,0xd8,0x5a,0x9f}, -{0xa7,0x5,0x6,0x67,0xee,0x34,0x62,0x6a,0x8b,0xb,0x28,0xbe,0x6e,0xb9,0x17,0x27,0x47,0x74,0x7,0x26,0xc6,0x80,0x10,0x3f,0xe0,0xa0,0x7e,0x6f,0xc6,0x7e,0x48,0x7b}, -{0xd,0x55,0xa,0xa5,0x4a,0xf8,0xa4,0xc0,0x91,0xe3,0xe7,0x9f,0x97,0x8e,0xf1,0x9e,0x86,0x76,0x72,0x81,0x50,0x60,0x8d,0xd4,0x7e,0x9e,0x5a,0x41,0xf3,0xe5,0xb0,0x62}, -{0xfc,0x9f,0x1f,0xec,0x40,0x54,0x20,0x7a,0xe3,0xe4,0x1a,0x0,0xce,0xf4,0xc9,0x84,0x4f,0xd7,0x94,0xf5,0x9d,0xfa,0x95,0xd8,0x55,0x2e,0x7e,0x11,0x24,0xc3,0x54,0xa5}, -{0x5b,0xdf,0x72,0x28,0xbd,0xfe,0x6e,0x28,0x78,0xf5,0x7f,0xe2,0xf,0xa5,0xc4,0xb2,0x5,0x89,0x7c,0xef,0xee,0x49,0xd3,0x2e,0x44,0x7e,0x93,0x85,0xeb,0x28,0x59,0x7f}, -{0x70,0x5f,0x69,0x37,0xb3,0x24,0x31,0x4a,0x5e,0x86,0x28,0xf1,0x1d,0xd6,0xe4,0x65,0xc7,0x1b,0x77,0x4,0x51,0xb9,0x20,0xe7,0x74,0xfe,0x43,0xe8,0x23,0xd4,0x87,0x8a}, -{0x7d,0x29,0xe8,0xa3,0x92,0x76,0x94,0xf2,0xdd,0xcb,0x7a,0x9,0x9b,0x30,0xd9,0xc1,0x1d,0x1b,0x30,0xfb,0x5b,0xdc,0x1b,0xe0,0xda,0x24,0x49,0x4f,0xf2,0x9c,0x82,0xbf}, -{0xa4,0xe7,0xba,0x31,0xb4,0x70,0xbf,0xff,0xd,0x32,0x44,0x5,0xde,0xf8,0xbc,0x48,0x3b,0xae,0xfc,0x32,0x53,0xbb,0xd3,0x39,0x45,0x9f,0xc3,0xc1,0xe0,0x29,0x8b,0xa0}, -{0xe5,0xc9,0x5,0xfd,0xf7,0xae,0x9,0xf,0x94,0x70,0x34,0x12,0x42,0x90,0xf1,0x34,0xa2,0x71,0xb7,0x1,0xe3,0x44,0xed,0x95,0xe9,0x3b,0x8e,0x36,0x4f,0x2f,0x98,0x4a}, -{0x88,0x40,0x1d,0x63,0xa0,0x6c,0xf6,0x15,0x47,0xc1,0x44,0x4b,0x87,0x52,0xaf,0xff,0x7e,0xbb,0x4a,0xf1,0xe2,0xa,0xc6,0x30,0x46,0x70,0xb6,0xc5,0xcc,0x6e,0x8c,0xe6}, -{0xa4,0xd5,0xa4,0x56,0xbd,0x4f,0xca,0x0,0xda,0x9d,0x84,0x4b,0xc8,0x3e,0x18,0xae,0x73,0x57,0xce,0x45,0x30,0x64,0xd1,0xad,0xe8,0xa6,0xce,0x68,0x14,0x5c,0x25,0x67}, -{0xa3,0xda,0x8c,0xf2,0xcb,0xe,0xe1,0x16,0x33,0xe9,0x6,0x58,0x9a,0x94,0x99,0x9a,0x1f,0x60,0xb2,0x20,0xc2,0x6f,0x84,0x7b,0xd1,0xce,0xac,0x7f,0xa0,0xd1,0x85,0x18}, -{0x32,0x59,0x5b,0xa1,0x8d,0xdd,0x19,0xd3,0x50,0x9a,0x1c,0xc0,0xaa,0xa5,0xb4,0x46,0x9f,0x3d,0x63,0x67,0xe4,0x4,0x6b,0xba,0xf6,0xca,0x19,0xab,0xb,0x56,0xee,0x7e}, -{0x1f,0xb1,0x79,0xea,0xa9,0x28,0x21,0x74,0xe9,0xbd,0xf7,0x35,0x3b,0x36,0x51,0xee,0x1d,0x57,0xac,0x5a,0x75,0x50,0xd3,0x76,0x3a,0x46,0xc2,0xfe,0xa3,0x7d,0x70,0x1}, -{0xf7,0x35,0xc1,0xaf,0x98,0xa4,0xd8,0x42,0x78,0xed,0xec,0x20,0x9e,0x6b,0x67,0x79,0x41,0x83,0x63,0x15,0xea,0x3a,0xdb,0xa8,0xfa,0xc3,0x3b,0x4d,0x32,0x83,0x2c,0x83}, -{0xa7,0x40,0x3b,0x1f,0x1c,0x27,0x47,0xf3,0x59,0x40,0xf0,0x34,0xb7,0x2d,0x76,0x9a,0xe7,0x3e,0x4e,0x6c,0xd2,0x21,0x4f,0xfd,0xb8,0xfd,0x8d,0x39,0xdc,0x57,0x59,0xef}, -{0x8d,0x9b,0xc,0x49,0x2b,0x49,0xeb,0xda,0x5b,0xa2,0xd7,0x49,0x68,0xf3,0x70,0xd,0x7d,0x3b,0xae,0xd0,0x7a,0x8d,0x55,0x84,0xf5,0xa5,0xe9,0xf0,0xe4,0xf8,0x8e,0x65}, -{0xa0,0xb8,0xa2,0xf4,0x36,0x10,0x3b,0x53,0xc,0xa8,0x7,0x9e,0x75,0x3e,0xec,0x5a,0x91,0x68,0x94,0x92,0x56,0xe8,0x88,0x4f,0x5b,0xb0,0x5c,0x55,0xf8,0xba,0xbc,0x4c}, -{0xe3,0xbb,0x3b,0x99,0xf3,0x87,0x94,0x7b,0x75,0xda,0xf4,0xd6,0x72,0x6b,0x1c,0x5d,0x64,0xae,0xac,0x28,0xdc,0x34,0xb3,0x6d,0x6c,0x34,0xa5,0x50,0xb8,0x28,0xdb,0x71}, -{0xf8,0x61,0xe2,0xf2,0x10,0x8d,0x51,0x2a,0xe3,0xdb,0x64,0x33,0x59,0xdd,0x75,0xfc,0x1c,0xac,0xbc,0xf1,0x43,0xce,0x3f,0xa2,0x67,0xbb,0xd1,0x3c,0x2,0xe8,0x43,0xb0}, -{0x33,0xa,0x5b,0xca,0x88,0x29,0xa1,0x75,0x7f,0x34,0x19,0x4d,0xb4,0x16,0x53,0x5c,0x92,0x3b,0x94,0xc3,0xe,0x79,0x4d,0x1e,0x79,0x74,0x75,0xd7,0xb6,0xee,0xaf,0x3f}, -{0xea,0xa8,0xd4,0xf7,0xbe,0x1a,0x39,0x21,0x5c,0xf4,0x7e,0x9,0x4c,0x23,0x27,0x51,0x26,0xa3,0x24,0x53,0xba,0x32,0x3c,0xd2,0x44,0xa3,0x17,0x4a,0x6d,0xa6,0xd5,0xad}, -{0xb5,0x1d,0x3e,0xa6,0xaf,0xf2,0xc9,0x8,0x83,0x59,0x3d,0x98,0x91,0x6b,0x3c,0x56,0x4c,0xf8,0x7c,0xa1,0x72,0x86,0x60,0x4d,0x46,0xe2,0x3e,0xcc,0x8,0x6e,0xc7,0xf6}, -{0x2f,0x98,0x33,0xb3,0xb1,0xbc,0x76,0x5e,0x2b,0xd6,0x66,0xa5,0xef,0xc4,0xe6,0x2a,0x6,0xf4,0xb6,0xe8,0xbe,0xc1,0xd4,0x36,0x74,0xee,0x82,0x15,0xbc,0xef,0x21,0x63}, -{0xfd,0xc1,0x4e,0xd,0xf4,0x53,0xc9,0x69,0xa7,0x7d,0x5a,0xc4,0x6,0x58,0x58,0x26,0x7e,0xc1,0x14,0x16,0x6,0xe0,0xfa,0x16,0x7e,0x90,0xaf,0x3d,0x28,0x63,0x9d,0x3f}, -{0xd2,0xc9,0xf2,0xe3,0x0,0x9b,0xd2,0xc,0x5f,0xaa,0xce,0x30,0xb7,0xd4,0xc,0x30,0x74,0x2a,0x51,0x16,0xf2,0xe0,0x32,0x98,0xd,0xeb,0x30,0xd8,0xe3,0xce,0xf8,0x9a}, -{0x4b,0xc5,0x9e,0x7b,0xb5,0xf1,0x79,0x92,0xff,0x51,0xe6,0x6e,0x4,0x86,0x68,0xd3,0x9b,0x23,0x4d,0x57,0xe6,0x96,0x67,0x31,0xcc,0xe6,0xa6,0xf3,0x17,0xa,0x75,0x5}, -{0xb1,0x76,0x81,0xd9,0x13,0x32,0x6c,0xce,0x3c,0x17,0x52,0x84,0xf8,0x5,0xa2,0x62,0xf4,0x2b,0xcb,0xb3,0x78,0x47,0x15,0x47,0xff,0x46,0x54,0x82,0x23,0x93,0x6a,0x48}, -{0x38,0xdf,0x58,0x7,0x4e,0x5e,0x65,0x65,0xf2,0xfc,0x7c,0x89,0xfc,0x86,0x50,0x8e,0x31,0x70,0x2e,0x44,0xd0,0xb,0xca,0x86,0xf0,0x40,0x9,0xa2,0x30,0x78,0x47,0x4e}, -{0x65,0xa0,0xee,0x39,0xd1,0xf7,0x38,0x83,0xf7,0x5e,0xe9,0x37,0xe4,0x2c,0x3a,0xbd,0x21,0x97,0xb2,0x26,0x1,0x13,0xf8,0x6f,0xa3,0x44,0xed,0xd1,0xef,0x9f,0xde,0xe7}, -{0x8b,0xa0,0xdf,0x15,0x76,0x25,0x92,0xd9,0x3c,0x85,0xf7,0xf6,0x12,0xdc,0x42,0xbe,0xd8,0xa7,0xec,0x7c,0xab,0x27,0xb0,0x7e,0x53,0x8d,0x7d,0xda,0xaa,0x3e,0xa8,0xde}, -{0xaa,0x25,0xce,0x93,0xbd,0x2,0x69,0xd8,0x5a,0xf6,0x43,0xfd,0x1a,0x73,0x8,0xf9,0xc0,0x5f,0xef,0xda,0x17,0x4a,0x19,0xa5,0x97,0x4d,0x66,0x33,0x4c,0xfd,0x21,0x6a}, -{0x35,0xb4,0x98,0x31,0xdb,0x41,0x15,0x70,0xea,0x1e,0xf,0xbb,0xed,0xcd,0x54,0x9b,0x9a,0xd0,0x63,0xa1,0x51,0x97,0x40,0x72,0xf6,0x75,0x9d,0xbf,0x91,0x47,0x6f,0xe2}}; - - -static void E8(hashState *state); /*The bijective function E8, in bitslice form*/ -static void F8(hashState *state); /*The compression function F8 */ +const unsigned char E8_bitslice_roundconstant[42][32] = { + {0x72, 0xd5, 0xde, 0xa2, 0xdf, 0x15, 0xf8, 0x67, 0x7b, 0x84, 0x15, 0xa, 0xb7, 0x23, 0x15, 0x57, 0x81, 0xab, 0xd6, 0x90, 0x4d, 0x5a, 0x87, 0xf6, 0x4e, 0x9f, 0x4f, 0xc5, 0xc3, 0xd1, 0x2b, 0x40}, + {0xea, 0x98, 0x3a, 0xe0, 0x5c, 0x45, 0xfa, 0x9c, 0x3, 0xc5, 0xd2, 0x99, 0x66, 0xb2, 0x99, 0x9a, 0x66, 0x2, 0x96, 0xb4, 0xf2, 0xbb, 0x53, 0x8a, 0xb5, 0x56, 0x14, 0x1a, 0x88, 0xdb, 0xa2, 0x31}, + {0x3, 0xa3, 0x5a, 0x5c, 0x9a, 0x19, 0xe, 0xdb, 0x40, 0x3f, 0xb2, 0xa, 0x87, 0xc1, 0x44, 0x10, 0x1c, 0x5, 0x19, 0x80, 0x84, 0x9e, 0x95, 0x1d, 0x6f, 0x33, 0xeb, 0xad, 0x5e, 0xe7, 0xcd, 0xdc}, + {0x10, 0xba, 0x13, 0x92, 0x2, 0xbf, 0x6b, 0x41, 0xdc, 0x78, 0x65, 0x15, 0xf7, 0xbb, 0x27, 0xd0, 0xa, 0x2c, 0x81, 0x39, 0x37, 0xaa, 0x78, 0x50, 0x3f, 0x1a, 0xbf, 0xd2, 0x41, 0x0, 0x91, 0xd3}, + {0x42, 0x2d, 0x5a, 0xd, 0xf6, 0xcc, 0x7e, 0x90, 0xdd, 0x62, 0x9f, 0x9c, 0x92, 0xc0, 0x97, 0xce, 0x18, 0x5c, 0xa7, 0xb, 0xc7, 0x2b, 0x44, 0xac, 0xd1, 0xdf, 0x65, 0xd6, 0x63, 0xc6, 0xfc, 0x23}, + {0x97, 0x6e, 0x6c, 0x3, 0x9e, 0xe0, 0xb8, 0x1a, 0x21, 0x5, 0x45, 0x7e, 0x44, 0x6c, 0xec, 0xa8, 0xee, 0xf1, 0x3, 0xbb, 0x5d, 0x8e, 0x61, 0xfa, 0xfd, 0x96, 0x97, 0xb2, 0x94, 0x83, 0x81, 0x97}, + {0x4a, 0x8e, 0x85, 0x37, 0xdb, 0x3, 0x30, 0x2f, 0x2a, 0x67, 0x8d, 0x2d, 0xfb, 0x9f, 0x6a, 0x95, 0x8a, 0xfe, 0x73, 0x81, 0xf8, 0xb8, 0x69, 0x6c, 0x8a, 0xc7, 0x72, 0x46, 0xc0, 0x7f, 0x42, 0x14}, + {0xc5, 0xf4, 0x15, 0x8f, 0xbd, 0xc7, 0x5e, 0xc4, 0x75, 0x44, 0x6f, 0xa7, 0x8f, 0x11, 0xbb, 0x80, 0x52, 0xde, 0x75, 0xb7, 0xae, 0xe4, 0x88, 0xbc, 0x82, 0xb8, 0x0, 0x1e, 0x98, 0xa6, 0xa3, 0xf4}, + {0x8e, 0xf4, 0x8f, 0x33, 0xa9, 0xa3, 0x63, 0x15, 0xaa, 0x5f, 0x56, 0x24, 0xd5, 0xb7, 0xf9, 0x89, 0xb6, 0xf1, 0xed, 0x20, 0x7c, 0x5a, 0xe0, 0xfd, 0x36, 0xca, 0xe9, 0x5a, 0x6, 0x42, 0x2c, 0x36}, + {0xce, 0x29, 0x35, 0x43, 0x4e, 0xfe, 0x98, 0x3d, 0x53, 0x3a, 0xf9, 0x74, 0x73, 0x9a, 0x4b, 0xa7, 0xd0, 0xf5, 0x1f, 0x59, 0x6f, 0x4e, 0x81, 0x86, 0xe, 0x9d, 0xad, 0x81, 0xaf, 0xd8, 0x5a, 0x9f}, + {0xa7, 0x5, 0x6, 0x67, 0xee, 0x34, 0x62, 0x6a, 0x8b, 0xb, 0x28, 0xbe, 0x6e, 0xb9, 0x17, 0x27, 0x47, 0x74, 0x7, 0x26, 0xc6, 0x80, 0x10, 0x3f, 0xe0, 0xa0, 0x7e, 0x6f, 0xc6, 0x7e, 0x48, 0x7b}, + {0xd, 0x55, 0xa, 0xa5, 0x4a, 0xf8, 0xa4, 0xc0, 0x91, 0xe3, 0xe7, 0x9f, 0x97, 0x8e, 0xf1, 0x9e, 0x86, 0x76, 0x72, 0x81, 0x50, 0x60, 0x8d, 0xd4, 0x7e, 0x9e, 0x5a, 0x41, 0xf3, 0xe5, 0xb0, 0x62}, + {0xfc, 0x9f, 0x1f, 0xec, 0x40, 0x54, 0x20, 0x7a, 0xe3, 0xe4, 0x1a, 0x0, 0xce, 0xf4, 0xc9, 0x84, 0x4f, 0xd7, 0x94, 0xf5, 0x9d, 0xfa, 0x95, 0xd8, 0x55, 0x2e, 0x7e, 0x11, 0x24, 0xc3, 0x54, 0xa5}, + {0x5b, 0xdf, 0x72, 0x28, 0xbd, 0xfe, 0x6e, 0x28, 0x78, 0xf5, 0x7f, 0xe2, 0xf, 0xa5, 0xc4, 0xb2, 0x5, 0x89, 0x7c, 0xef, 0xee, 0x49, 0xd3, 0x2e, 0x44, 0x7e, 0x93, 0x85, 0xeb, 0x28, 0x59, 0x7f}, + {0x70, 0x5f, 0x69, 0x37, 0xb3, 0x24, 0x31, 0x4a, 0x5e, 0x86, 0x28, 0xf1, 0x1d, 0xd6, 0xe4, 0x65, 0xc7, 0x1b, 0x77, 0x4, 0x51, 0xb9, 0x20, 0xe7, 0x74, 0xfe, 0x43, 0xe8, 0x23, 0xd4, 0x87, 0x8a}, + {0x7d, 0x29, 0xe8, 0xa3, 0x92, 0x76, 0x94, 0xf2, 0xdd, 0xcb, 0x7a, 0x9, 0x9b, 0x30, 0xd9, 0xc1, 0x1d, 0x1b, 0x30, 0xfb, 0x5b, 0xdc, 0x1b, 0xe0, 0xda, 0x24, 0x49, 0x4f, 0xf2, 0x9c, 0x82, 0xbf}, + {0xa4, 0xe7, 0xba, 0x31, 0xb4, 0x70, 0xbf, 0xff, 0xd, 0x32, 0x44, 0x5, 0xde, 0xf8, 0xbc, 0x48, 0x3b, 0xae, 0xfc, 0x32, 0x53, 0xbb, 0xd3, 0x39, 0x45, 0x9f, 0xc3, 0xc1, 0xe0, 0x29, 0x8b, 0xa0}, + {0xe5, 0xc9, 0x5, 0xfd, 0xf7, 0xae, 0x9, 0xf, 0x94, 0x70, 0x34, 0x12, 0x42, 0x90, 0xf1, 0x34, 0xa2, 0x71, 0xb7, 0x1, 0xe3, 0x44, 0xed, 0x95, 0xe9, 0x3b, 0x8e, 0x36, 0x4f, 0x2f, 0x98, 0x4a}, + {0x88, 0x40, 0x1d, 0x63, 0xa0, 0x6c, 0xf6, 0x15, 0x47, 0xc1, 0x44, 0x4b, 0x87, 0x52, 0xaf, 0xff, 0x7e, 0xbb, 0x4a, 0xf1, 0xe2, 0xa, 0xc6, 0x30, 0x46, 0x70, 0xb6, 0xc5, 0xcc, 0x6e, 0x8c, 0xe6}, + {0xa4, 0xd5, 0xa4, 0x56, 0xbd, 0x4f, 0xca, 0x0, 0xda, 0x9d, 0x84, 0x4b, 0xc8, 0x3e, 0x18, 0xae, 0x73, 0x57, 0xce, 0x45, 0x30, 0x64, 0xd1, 0xad, 0xe8, 0xa6, 0xce, 0x68, 0x14, 0x5c, 0x25, 0x67}, + {0xa3, 0xda, 0x8c, 0xf2, 0xcb, 0xe, 0xe1, 0x16, 0x33, 0xe9, 0x6, 0x58, 0x9a, 0x94, 0x99, 0x9a, 0x1f, 0x60, 0xb2, 0x20, 0xc2, 0x6f, 0x84, 0x7b, 0xd1, 0xce, 0xac, 0x7f, 0xa0, 0xd1, 0x85, 0x18}, + {0x32, 0x59, 0x5b, 0xa1, 0x8d, 0xdd, 0x19, 0xd3, 0x50, 0x9a, 0x1c, 0xc0, 0xaa, 0xa5, 0xb4, 0x46, 0x9f, 0x3d, 0x63, 0x67, 0xe4, 0x4, 0x6b, 0xba, 0xf6, 0xca, 0x19, 0xab, 0xb, 0x56, 0xee, 0x7e}, + {0x1f, 0xb1, 0x79, 0xea, 0xa9, 0x28, 0x21, 0x74, 0xe9, 0xbd, 0xf7, 0x35, 0x3b, 0x36, 0x51, 0xee, 0x1d, 0x57, 0xac, 0x5a, 0x75, 0x50, 0xd3, 0x76, 0x3a, 0x46, 0xc2, 0xfe, 0xa3, 0x7d, 0x70, 0x1}, + {0xf7, 0x35, 0xc1, 0xaf, 0x98, 0xa4, 0xd8, 0x42, 0x78, 0xed, 0xec, 0x20, 0x9e, 0x6b, 0x67, 0x79, 0x41, 0x83, 0x63, 0x15, 0xea, 0x3a, 0xdb, 0xa8, 0xfa, 0xc3, 0x3b, 0x4d, 0x32, 0x83, 0x2c, 0x83}, + {0xa7, 0x40, 0x3b, 0x1f, 0x1c, 0x27, 0x47, 0xf3, 0x59, 0x40, 0xf0, 0x34, 0xb7, 0x2d, 0x76, 0x9a, 0xe7, 0x3e, 0x4e, 0x6c, 0xd2, 0x21, 0x4f, 0xfd, 0xb8, 0xfd, 0x8d, 0x39, 0xdc, 0x57, 0x59, 0xef}, + {0x8d, 0x9b, 0xc, 0x49, 0x2b, 0x49, 0xeb, 0xda, 0x5b, 0xa2, 0xd7, 0x49, 0x68, 0xf3, 0x70, 0xd, 0x7d, 0x3b, 0xae, 0xd0, 0x7a, 0x8d, 0x55, 0x84, 0xf5, 0xa5, 0xe9, 0xf0, 0xe4, 0xf8, 0x8e, 0x65}, + {0xa0, 0xb8, 0xa2, 0xf4, 0x36, 0x10, 0x3b, 0x53, 0xc, 0xa8, 0x7, 0x9e, 0x75, 0x3e, 0xec, 0x5a, 0x91, 0x68, 0x94, 0x92, 0x56, 0xe8, 0x88, 0x4f, 0x5b, 0xb0, 0x5c, 0x55, 0xf8, 0xba, 0xbc, 0x4c}, + {0xe3, 0xbb, 0x3b, 0x99, 0xf3, 0x87, 0x94, 0x7b, 0x75, 0xda, 0xf4, 0xd6, 0x72, 0x6b, 0x1c, 0x5d, 0x64, 0xae, 0xac, 0x28, 0xdc, 0x34, 0xb3, 0x6d, 0x6c, 0x34, 0xa5, 0x50, 0xb8, 0x28, 0xdb, 0x71}, + {0xf8, 0x61, 0xe2, 0xf2, 0x10, 0x8d, 0x51, 0x2a, 0xe3, 0xdb, 0x64, 0x33, 0x59, 0xdd, 0x75, 0xfc, 0x1c, 0xac, 0xbc, 0xf1, 0x43, 0xce, 0x3f, 0xa2, 0x67, 0xbb, 0xd1, 0x3c, 0x2, 0xe8, 0x43, 0xb0}, + {0x33, 0xa, 0x5b, 0xca, 0x88, 0x29, 0xa1, 0x75, 0x7f, 0x34, 0x19, 0x4d, 0xb4, 0x16, 0x53, 0x5c, 0x92, 0x3b, 0x94, 0xc3, 0xe, 0x79, 0x4d, 0x1e, 0x79, 0x74, 0x75, 0xd7, 0xb6, 0xee, 0xaf, 0x3f}, + {0xea, 0xa8, 0xd4, 0xf7, 0xbe, 0x1a, 0x39, 0x21, 0x5c, 0xf4, 0x7e, 0x9, 0x4c, 0x23, 0x27, 0x51, 0x26, 0xa3, 0x24, 0x53, 0xba, 0x32, 0x3c, 0xd2, 0x44, 0xa3, 0x17, 0x4a, 0x6d, 0xa6, 0xd5, 0xad}, + {0xb5, 0x1d, 0x3e, 0xa6, 0xaf, 0xf2, 0xc9, 0x8, 0x83, 0x59, 0x3d, 0x98, 0x91, 0x6b, 0x3c, 0x56, 0x4c, 0xf8, 0x7c, 0xa1, 0x72, 0x86, 0x60, 0x4d, 0x46, 0xe2, 0x3e, 0xcc, 0x8, 0x6e, 0xc7, 0xf6}, + {0x2f, 0x98, 0x33, 0xb3, 0xb1, 0xbc, 0x76, 0x5e, 0x2b, 0xd6, 0x66, 0xa5, 0xef, 0xc4, 0xe6, 0x2a, 0x6, 0xf4, 0xb6, 0xe8, 0xbe, 0xc1, 0xd4, 0x36, 0x74, 0xee, 0x82, 0x15, 0xbc, 0xef, 0x21, 0x63}, + {0xfd, 0xc1, 0x4e, 0xd, 0xf4, 0x53, 0xc9, 0x69, 0xa7, 0x7d, 0x5a, 0xc4, 0x6, 0x58, 0x58, 0x26, 0x7e, 0xc1, 0x14, 0x16, 0x6, 0xe0, 0xfa, 0x16, 0x7e, 0x90, 0xaf, 0x3d, 0x28, 0x63, 0x9d, 0x3f}, + {0xd2, 0xc9, 0xf2, 0xe3, 0x0, 0x9b, 0xd2, 0xc, 0x5f, 0xaa, 0xce, 0x30, 0xb7, 0xd4, 0xc, 0x30, 0x74, 0x2a, 0x51, 0x16, 0xf2, 0xe0, 0x32, 0x98, 0xd, 0xeb, 0x30, 0xd8, 0xe3, 0xce, 0xf8, 0x9a}, + {0x4b, 0xc5, 0x9e, 0x7b, 0xb5, 0xf1, 0x79, 0x92, 0xff, 0x51, 0xe6, 0x6e, 0x4, 0x86, 0x68, 0xd3, 0x9b, 0x23, 0x4d, 0x57, 0xe6, 0x96, 0x67, 0x31, 0xcc, 0xe6, 0xa6, 0xf3, 0x17, 0xa, 0x75, 0x5}, + {0xb1, 0x76, 0x81, 0xd9, 0x13, 0x32, 0x6c, 0xce, 0x3c, 0x17, 0x52, 0x84, 0xf8, 0x5, 0xa2, 0x62, 0xf4, 0x2b, 0xcb, 0xb3, 0x78, 0x47, 0x15, 0x47, 0xff, 0x46, 0x54, 0x82, 0x23, 0x93, 0x6a, 0x48}, + {0x38, 0xdf, 0x58, 0x7, 0x4e, 0x5e, 0x65, 0x65, 0xf2, 0xfc, 0x7c, 0x89, 0xfc, 0x86, 0x50, 0x8e, 0x31, 0x70, 0x2e, 0x44, 0xd0, 0xb, 0xca, 0x86, 0xf0, 0x40, 0x9, 0xa2, 0x30, 0x78, 0x47, 0x4e}, + {0x65, 0xa0, 0xee, 0x39, 0xd1, 0xf7, 0x38, 0x83, 0xf7, 0x5e, 0xe9, 0x37, 0xe4, 0x2c, 0x3a, 0xbd, 0x21, 0x97, 0xb2, 0x26, 0x1, 0x13, 0xf8, 0x6f, 0xa3, 0x44, 0xed, 0xd1, 0xef, 0x9f, 0xde, 0xe7}, + {0x8b, 0xa0, 0xdf, 0x15, 0x76, 0x25, 0x92, 0xd9, 0x3c, 0x85, 0xf7, 0xf6, 0x12, 0xdc, 0x42, 0xbe, 0xd8, 0xa7, 0xec, 0x7c, 0xab, 0x27, 0xb0, 0x7e, 0x53, 0x8d, 0x7d, 0xda, 0xaa, 0x3e, 0xa8, 0xde}, + {0xaa, 0x25, 0xce, 0x93, 0xbd, 0x2, 0x69, 0xd8, 0x5a, 0xf6, 0x43, 0xfd, 0x1a, 0x73, 0x8, 0xf9, 0xc0, 0x5f, 0xef, 0xda, 0x17, 0x4a, 0x19, 0xa5, 0x97, 0x4d, 0x66, 0x33, 0x4c, 0xfd, 0x21, 0x6a}, + {0x35, 0xb4, 0x98, 0x31, 0xdb, 0x41, 0x15, 0x70, 0xea, 0x1e, 0xf, 0xbb, 0xed, 0xcd, 0x54, 0x9b, 0x9a, 0xd0, 0x63, 0xa1, 0x51, 0x97, 0x40, 0x72, 0xf6, 0x75, 0x9d, 0xbf, 0x91, 0x47, 0x6f, 0xe2}}; + +static void E8(hashState* state); /*The bijective function E8, in bitslice form*/ +static void F8(hashState* state); /*The compression function F8 */ /*The API functions*/ -static HashReturn Init(hashState *state, int hashbitlen); -static HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen); -static HashReturn Final(hashState *state, BitSequence *hashval); -HashReturn jh_hash(int hashbitlen, const BitSequence *data,DataLength databitlen, BitSequence *hashval); +static HashReturn Init(hashState* state, int hashbitlen); +static HashReturn Update(hashState* state, const BitSequence* data, DataLength databitlen); +static HashReturn Final(hashState* state, BitSequence* hashval); +HashReturn jh_hash(int hashbitlen, const BitSequence* data, DataLength databitlen, BitSequence* hashval); /*swapping bit 2i with bit 2i+1 of 64-bit x*/ -#define SWAP1(x) (x) = ((((x) & 0x5555555555555555ULL) << 1) | (((x) & 0xaaaaaaaaaaaaaaaaULL) >> 1)); +#define SWAP1(x) (x) = ((((x)&0x5555555555555555ULL) << 1) | (((x)&0xaaaaaaaaaaaaaaaaULL) >> 1)); /*swapping bits 4i||4i+1 with bits 4i+2||4i+3 of 64-bit x*/ -#define SWAP2(x) (x) = ((((x) & 0x3333333333333333ULL) << 2) | (((x) & 0xccccccccccccccccULL) >> 2)); +#define SWAP2(x) (x) = ((((x)&0x3333333333333333ULL) << 2) | (((x)&0xccccccccccccccccULL) >> 2)); /*swapping bits 8i||8i+1||8i+2||8i+3 with bits 8i+4||8i+5||8i+6||8i+7 of 64-bit x*/ -#define SWAP4(x) (x) = ((((x) & 0x0f0f0f0f0f0f0f0fULL) << 4) | (((x) & 0xf0f0f0f0f0f0f0f0ULL) >> 4)); +#define SWAP4(x) (x) = ((((x)&0x0f0f0f0f0f0f0f0fULL) << 4) | (((x)&0xf0f0f0f0f0f0f0f0ULL) >> 4)); /*swapping bits 16i||16i+1||......||16i+7 with bits 16i+8||16i+9||......||16i+15 of 64-bit x*/ -#define SWAP8(x) (x) = ((((x) & 0x00ff00ff00ff00ffULL) << 8) | (((x) & 0xff00ff00ff00ff00ULL) >> 8)); +#define SWAP8(x) (x) = ((((x)&0x00ff00ff00ff00ffULL) << 8) | (((x)&0xff00ff00ff00ff00ULL) >> 8)); /*swapping bits 32i||32i+1||......||32i+15 with bits 32i+16||32i+17||......||32i+31 of 64-bit x*/ -#define SWAP16(x) (x) = ((((x) & 0x0000ffff0000ffffULL) << 16) | (((x) & 0xffff0000ffff0000ULL) >> 16)); +#define SWAP16(x) (x) = ((((x)&0x0000ffff0000ffffULL) << 16) | (((x)&0xffff0000ffff0000ULL) >> 16)); /*swapping bits 64i||64i+1||......||64i+31 with bits 64i+32||64i+33||......||64i+63 of 64-bit x*/ -#define SWAP32(x) (x) = (((x) << 32) | ((x) >> 32)); +#define SWAP32(x) (x) = (((x) << 32) | ((x) >> 32)); /*The MDS transform*/ -#define L(m0,m1,m2,m3,m4,m5,m6,m7) \ - (m4) ^= (m1); \ - (m5) ^= (m2); \ - (m6) ^= (m0) ^ (m3); \ - (m7) ^= (m0); \ - (m0) ^= (m5); \ - (m1) ^= (m6); \ - (m2) ^= (m4) ^ (m7); \ - (m3) ^= (m4); +#define L(m0, m1, m2, m3, m4, m5, m6, m7) \ + (m4) ^= (m1); \ + (m5) ^= (m2); \ + (m6) ^= (m0) ^ (m3); \ + (m7) ^= (m0); \ + (m0) ^= (m5); \ + (m1) ^= (m6); \ + (m2) ^= (m4) ^ (m7); \ + (m3) ^= (m4); /*Two Sboxes are computed in parallel, each Sbox implements S0 and S1, selected by a constant bit*/ /*The reason to compute two Sboxes in parallel is to try to fully utilize the parallel processing power*/ -#define SS(m0,m1,m2,m3,m4,m5,m6,m7,cc0,cc1) \ - m3 = ~(m3); \ - m7 = ~(m7); \ - m0 ^= ((~(m2)) & (cc0)); \ - m4 ^= ((~(m6)) & (cc1)); \ - temp0 = (cc0) ^ ((m0) & (m1));\ - temp1 = (cc1) ^ ((m4) & (m5));\ - m0 ^= ((m2) & (m3)); \ - m4 ^= ((m6) & (m7)); \ - m3 ^= ((~(m1)) & (m2)); \ - m7 ^= ((~(m5)) & (m6)); \ - m1 ^= ((m0) & (m2)); \ - m5 ^= ((m4) & (m6)); \ - m2 ^= ((m0) & (~(m3))); \ - m6 ^= ((m4) & (~(m7))); \ - m0 ^= ((m1) | (m3)); \ - m4 ^= ((m5) | (m7)); \ - m3 ^= ((m1) & (m2)); \ - m7 ^= ((m5) & (m6)); \ - m1 ^= (temp0 & (m0)); \ - m5 ^= (temp1 & (m4)); \ - m2 ^= temp0; \ - m6 ^= temp1; +#define SS(m0, m1, m2, m3, m4, m5, m6, m7, cc0, cc1) \ + m3 = ~(m3); \ + m7 = ~(m7); \ + m0 ^= ((~(m2)) & (cc0)); \ + m4 ^= ((~(m6)) & (cc1)); \ + temp0 = (cc0) ^ ((m0) & (m1)); \ + temp1 = (cc1) ^ ((m4) & (m5)); \ + m0 ^= ((m2) & (m3)); \ + m4 ^= ((m6) & (m7)); \ + m3 ^= ((~(m1)) & (m2)); \ + m7 ^= ((~(m5)) & (m6)); \ + m1 ^= ((m0) & (m2)); \ + m5 ^= ((m4) & (m6)); \ + m2 ^= ((m0) & (~(m3))); \ + m6 ^= ((m4) & (~(m7))); \ + m0 ^= ((m1) | (m3)); \ + m4 ^= ((m5) | (m7)); \ + m3 ^= ((m1) & (m2)); \ + m7 ^= ((m5) & (m6)); \ + m1 ^= (temp0 & (m0)); \ + m5 ^= (temp1 & (m4)); \ + m2 ^= temp0; \ + m6 ^= temp1; /*The bijective function E8, in bitslice form*/ -static void E8(hashState *state) +static void E8(hashState* state) { - uint64 i,roundnumber,temp0,temp1; - - for (roundnumber = 0; roundnumber < 42; roundnumber = roundnumber+7) { - /*round 7*roundnumber+0: Sbox, MDS and Swapping layers*/ - for (i = 0; i < 2; i++) { - SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+0])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+0])[i+2] ); - L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); - SWAP1(state->x[1][i]); SWAP1(state->x[3][i]); SWAP1(state->x[5][i]); SWAP1(state->x[7][i]); - } - - /*round 7*roundnumber+1: Sbox, MDS and Swapping layers*/ - for (i = 0; i < 2; i++) { - SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+1])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+1])[i+2] ); - L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); - SWAP2(state->x[1][i]); SWAP2(state->x[3][i]); SWAP2(state->x[5][i]); SWAP2(state->x[7][i]); - } - - /*round 7*roundnumber+2: Sbox, MDS and Swapping layers*/ - for (i = 0; i < 2; i++) { - SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+2])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+2])[i+2] ); - L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); - SWAP4(state->x[1][i]); SWAP4(state->x[3][i]); SWAP4(state->x[5][i]); SWAP4(state->x[7][i]); - } - - /*round 7*roundnumber+3: Sbox, MDS and Swapping layers*/ - for (i = 0; i < 2; i++) { - SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+3])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+3])[i+2] ); - L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); - SWAP8(state->x[1][i]); SWAP8(state->x[3][i]); SWAP8(state->x[5][i]); SWAP8(state->x[7][i]); - } - - /*round 7*roundnumber+4: Sbox, MDS and Swapping layers*/ - for (i = 0; i < 2; i++) { - SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+4])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+4])[i+2] ); - L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); - SWAP16(state->x[1][i]); SWAP16(state->x[3][i]); SWAP16(state->x[5][i]); SWAP16(state->x[7][i]); - } - - /*round 7*roundnumber+5: Sbox, MDS and Swapping layers*/ - for (i = 0; i < 2; i++) { - SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+5])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+5])[i+2] ); - L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); - SWAP32(state->x[1][i]); SWAP32(state->x[3][i]); SWAP32(state->x[5][i]); SWAP32(state->x[7][i]); - } - - /*round 7*roundnumber+6: Sbox and MDS layers*/ - for (i = 0; i < 2; i++) { - SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64*)E8_bitslice_roundconstant[roundnumber+6])[i],((uint64*)E8_bitslice_roundconstant[roundnumber+6])[i+2] ); - L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); - } - /*round 7*roundnumber+6: swapping layer*/ - for (i = 1; i < 8; i = i+2) { - temp0 = state->x[i][0]; state->x[i][0] = state->x[i][1]; state->x[i][1] = temp0; - } - } - + uint64 i, roundnumber, temp0, temp1; + + for(roundnumber = 0; roundnumber < 42; roundnumber = roundnumber + 7) + { + /*round 7*roundnumber+0: Sbox, MDS and Swapping layers*/ + for(i = 0; i < 2; i++) + { + SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 0])[i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 0])[i + 2]); + L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]); + SWAP1(state->x[1][i]); + SWAP1(state->x[3][i]); + SWAP1(state->x[5][i]); + SWAP1(state->x[7][i]); + } + + /*round 7*roundnumber+1: Sbox, MDS and Swapping layers*/ + for(i = 0; i < 2; i++) + { + SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 1])[i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 1])[i + 2]); + L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]); + SWAP2(state->x[1][i]); + SWAP2(state->x[3][i]); + SWAP2(state->x[5][i]); + SWAP2(state->x[7][i]); + } + + /*round 7*roundnumber+2: Sbox, MDS and Swapping layers*/ + for(i = 0; i < 2; i++) + { + SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 2])[i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 2])[i + 2]); + L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]); + SWAP4(state->x[1][i]); + SWAP4(state->x[3][i]); + SWAP4(state->x[5][i]); + SWAP4(state->x[7][i]); + } + + /*round 7*roundnumber+3: Sbox, MDS and Swapping layers*/ + for(i = 0; i < 2; i++) + { + SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 3])[i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 3])[i + 2]); + L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]); + SWAP8(state->x[1][i]); + SWAP8(state->x[3][i]); + SWAP8(state->x[5][i]); + SWAP8(state->x[7][i]); + } + + /*round 7*roundnumber+4: Sbox, MDS and Swapping layers*/ + for(i = 0; i < 2; i++) + { + SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 4])[i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 4])[i + 2]); + L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]); + SWAP16(state->x[1][i]); + SWAP16(state->x[3][i]); + SWAP16(state->x[5][i]); + SWAP16(state->x[7][i]); + } + + /*round 7*roundnumber+5: Sbox, MDS and Swapping layers*/ + for(i = 0; i < 2; i++) + { + SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 5])[i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 5])[i + 2]); + L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]); + SWAP32(state->x[1][i]); + SWAP32(state->x[3][i]); + SWAP32(state->x[5][i]); + SWAP32(state->x[7][i]); + } + + /*round 7*roundnumber+6: Sbox and MDS layers*/ + for(i = 0; i < 2; i++) + { + SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 6])[i], ((uint64*)E8_bitslice_roundconstant[roundnumber + 6])[i + 2]); + L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]); + } + /*round 7*roundnumber+6: swapping layer*/ + for(i = 1; i < 8; i = i + 2) + { + temp0 = state->x[i][0]; + state->x[i][0] = state->x[i][1]; + state->x[i][1] = temp0; + } + } } /*The compression function F8 */ -static void F8(hashState *state) +static void F8(hashState* state) { - uint64 i; + uint64 i; - /*xor the 512-bit message with the fist half of the 1024-bit hash state*/ - for (i = 0; i < 8; i++) state->x[i >> 1][i & 1] ^= ((uint64*)state->buffer)[i]; + /*xor the 512-bit message with the fist half of the 1024-bit hash state*/ + for(i = 0; i < 8; i++) + state->x[i >> 1][i & 1] ^= ((uint64*)state->buffer)[i]; - /*the bijective function E8 */ - E8(state); + /*the bijective function E8 */ + E8(state); - /*xor the 512-bit message with the second half of the 1024-bit hash state*/ - for (i = 0; i < 8; i++) state->x[(8+i) >> 1][(8+i) & 1] ^= ((uint64*)state->buffer)[i]; + /*xor the 512-bit message with the second half of the 1024-bit hash state*/ + for(i = 0; i < 8; i++) + state->x[(8 + i) >> 1][(8 + i) & 1] ^= ((uint64*)state->buffer)[i]; } /*before hashing a message, initialize the hash state as H0 */ -static HashReturn Init(hashState *state, int hashbitlen) +static HashReturn Init(hashState* state, int hashbitlen) { - state->databitlen = 0; - state->datasize_in_buffer = 0; - - /*initialize the initial hash value of JH*/ - state->hashbitlen = hashbitlen; - - /*load the initial hash value into state*/ - switch (hashbitlen) - { - case 224: memcpy(state->x,JH224_H0,128); break; - case 256: memcpy(state->x,JH256_H0,128); break; - case 384: memcpy(state->x,JH384_H0,128); break; - case 512: memcpy(state->x,JH512_H0,128); break; - } - - return(SUCCESS); + state->databitlen = 0; + state->datasize_in_buffer = 0; + + /*initialize the initial hash value of JH*/ + state->hashbitlen = hashbitlen; + + /*load the initial hash value into state*/ + switch(hashbitlen) + { + case 224: + memcpy(state->x, JH224_H0, 128); + break; + case 256: + memcpy(state->x, JH256_H0, 128); + break; + case 384: + memcpy(state->x, JH384_H0, 128); + break; + case 512: + memcpy(state->x, JH512_H0, 128); + break; + } + + return (SUCCESS); } - /*hash each 512-bit message block, except the last partial block*/ -static HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen) +static HashReturn Update(hashState* state, const BitSequence* data, DataLength databitlen) { - DataLength index; /*the starting address of the data to be compressed*/ - - state->databitlen += databitlen; - index = 0; - - /*if there is remaining data in the buffer, fill it to a full message block first*/ - /*we assume that the size of the data in the buffer is the multiple of 8 bits if it is not at the end of a message*/ - - /*There is data in the buffer, but the incoming data is insufficient for a full block*/ - if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) < 512) ) { - if ( (databitlen & 7) == 0 ) { - memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3)) ; - } - else memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3)+1) ; - state->datasize_in_buffer += databitlen; - databitlen = 0; - } - - /*There is data in the buffer, and the incoming data is sufficient for a full block*/ - if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) ) { - memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) ) ; - index = 64-(state->datasize_in_buffer >> 3); - databitlen = databitlen - (512 - state->datasize_in_buffer); - F8(state); - state->datasize_in_buffer = 0; - } - - /*hash the remaining full message blocks*/ - for ( ; databitlen >= 512; index = index+64, databitlen = databitlen - 512) { - memcpy(state->buffer, data+index, 64); - F8(state); - } - - /*store the partial block into buffer, assume that -- if part of the last byte is not part of the message, then that part consists of 0 bits*/ - if ( databitlen > 0) { - if ((databitlen & 7) == 0) - memcpy(state->buffer, data+index, (databitlen & 0x1ff) >> 3); - else - memcpy(state->buffer, data+index, ((databitlen & 0x1ff) >> 3)+1); - state->datasize_in_buffer = databitlen; - } - - return(SUCCESS); + DataLength index; /*the starting address of the data to be compressed*/ + + state->databitlen += databitlen; + index = 0; + + /*if there is remaining data in the buffer, fill it to a full message block first*/ + /*we assume that the size of the data in the buffer is the multiple of 8 bits if it is not at the end of a message*/ + + /*There is data in the buffer, but the incoming data is insufficient for a full block*/ + if((state->datasize_in_buffer > 0) && ((state->datasize_in_buffer + databitlen) < 512)) + { + if((databitlen & 7) == 0) + { + memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64 - (state->datasize_in_buffer >> 3)); + } + else + memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64 - (state->datasize_in_buffer >> 3) + 1); + state->datasize_in_buffer += databitlen; + databitlen = 0; + } + + /*There is data in the buffer, and the incoming data is sufficient for a full block*/ + if((state->datasize_in_buffer > 0) && ((state->datasize_in_buffer + databitlen) >= 512)) + { + memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64 - (state->datasize_in_buffer >> 3)); + index = 64 - (state->datasize_in_buffer >> 3); + databitlen = databitlen - (512 - state->datasize_in_buffer); + F8(state); + state->datasize_in_buffer = 0; + } + + /*hash the remaining full message blocks*/ + for(; databitlen >= 512; index = index + 64, databitlen = databitlen - 512) + { + memcpy(state->buffer, data + index, 64); + F8(state); + } + + /*store the partial block into buffer, assume that -- if part of the last byte is not part of the message, then that part consists of 0 bits*/ + if(databitlen > 0) + { + if((databitlen & 7) == 0) + memcpy(state->buffer, data + index, (databitlen & 0x1ff) >> 3); + else + memcpy(state->buffer, data + index, ((databitlen & 0x1ff) >> 3) + 1); + state->datasize_in_buffer = databitlen; + } + + return (SUCCESS); } /*pad the message, process the padded block(s), truncate the hash value H to obtain the message digest*/ -static HashReturn Final(hashState *state, BitSequence *hashval) +static HashReturn Final(hashState* state, BitSequence* hashval) { - unsigned int i; - - if ( (state->databitlen & 0x1ff) == 0 ) { - /*pad the message when databitlen is multiple of 512 bits, then process the padded block*/ - memset(state->buffer, 0, 64); - state->buffer[0] = 0x80; - state->buffer[63] = state->databitlen & 0xff; - state->buffer[62] = (state->databitlen >> 8) & 0xff; - state->buffer[61] = (state->databitlen >> 16) & 0xff; - state->buffer[60] = (state->databitlen >> 24) & 0xff; - state->buffer[59] = (state->databitlen >> 32) & 0xff; - state->buffer[58] = (state->databitlen >> 40) & 0xff; - state->buffer[57] = (state->databitlen >> 48) & 0xff; - state->buffer[56] = (state->databitlen >> 56) & 0xff; - F8(state); - } - else { - /*set the rest of the bytes in the buffer to 0*/ - if ( (state->datasize_in_buffer & 7) == 0) - for (i = (state->databitlen & 0x1ff) >> 3; i < 64; i++) state->buffer[i] = 0; - else - for (i = ((state->databitlen & 0x1ff) >> 3)+1; i < 64; i++) state->buffer[i] = 0; - - /*pad and process the partial block when databitlen is not multiple of 512 bits, then hash the padded blocks*/ - state->buffer[((state->databitlen & 0x1ff) >> 3)] |= 1 << (7- (state->databitlen & 7)); - - F8(state); - memset(state->buffer, 0, 64); - state->buffer[63] = state->databitlen & 0xff; - state->buffer[62] = (state->databitlen >> 8) & 0xff; - state->buffer[61] = (state->databitlen >> 16) & 0xff; - state->buffer[60] = (state->databitlen >> 24) & 0xff; - state->buffer[59] = (state->databitlen >> 32) & 0xff; - state->buffer[58] = (state->databitlen >> 40) & 0xff; - state->buffer[57] = (state->databitlen >> 48) & 0xff; - state->buffer[56] = (state->databitlen >> 56) & 0xff; - F8(state); - } - - /*truncating the final hash value to generate the message digest*/ - switch(state->hashbitlen) { - case 224: memcpy(hashval,(unsigned char*)state->x+64+36,28); break; - case 256: memcpy(hashval,(unsigned char*)state->x+64+32,32); break; - case 384: memcpy(hashval,(unsigned char*)state->x+64+16,48); break; - case 512: memcpy(hashval,(unsigned char*)state->x+64,64); break; - } - - return(SUCCESS); + unsigned int i; + + if((state->databitlen & 0x1ff) == 0) + { + /*pad the message when databitlen is multiple of 512 bits, then process the padded block*/ + memset(state->buffer, 0, 64); + state->buffer[0] = 0x80; + state->buffer[63] = state->databitlen & 0xff; + state->buffer[62] = (state->databitlen >> 8) & 0xff; + state->buffer[61] = (state->databitlen >> 16) & 0xff; + state->buffer[60] = (state->databitlen >> 24) & 0xff; + state->buffer[59] = (state->databitlen >> 32) & 0xff; + state->buffer[58] = (state->databitlen >> 40) & 0xff; + state->buffer[57] = (state->databitlen >> 48) & 0xff; + state->buffer[56] = (state->databitlen >> 56) & 0xff; + F8(state); + } + else + { + /*set the rest of the bytes in the buffer to 0*/ + if((state->datasize_in_buffer & 7) == 0) + for(i = (state->databitlen & 0x1ff) >> 3; i < 64; i++) + state->buffer[i] = 0; + else + for(i = ((state->databitlen & 0x1ff) >> 3) + 1; i < 64; i++) + state->buffer[i] = 0; + + /*pad and process the partial block when databitlen is not multiple of 512 bits, then hash the padded blocks*/ + state->buffer[((state->databitlen & 0x1ff) >> 3)] |= 1 << (7 - (state->databitlen & 7)); + + F8(state); + memset(state->buffer, 0, 64); + state->buffer[63] = state->databitlen & 0xff; + state->buffer[62] = (state->databitlen >> 8) & 0xff; + state->buffer[61] = (state->databitlen >> 16) & 0xff; + state->buffer[60] = (state->databitlen >> 24) & 0xff; + state->buffer[59] = (state->databitlen >> 32) & 0xff; + state->buffer[58] = (state->databitlen >> 40) & 0xff; + state->buffer[57] = (state->databitlen >> 48) & 0xff; + state->buffer[56] = (state->databitlen >> 56) & 0xff; + F8(state); + } + + /*truncating the final hash value to generate the message digest*/ + switch(state->hashbitlen) + { + case 224: + memcpy(hashval, (unsigned char*)state->x + 64 + 36, 28); + break; + case 256: + memcpy(hashval, (unsigned char*)state->x + 64 + 32, 32); + break; + case 384: + memcpy(hashval, (unsigned char*)state->x + 64 + 16, 48); + break; + case 512: + memcpy(hashval, (unsigned char*)state->x + 64, 64); + break; + } + + return (SUCCESS); } /* hash a message, three inputs: message digest size in bits (hashbitlen); message (data); message length in bits (databitlen) one output: message digest (hashval) */ -HashReturn jh_hash(int hashbitlen, const BitSequence *data,DataLength databitlen, BitSequence *hashval) +HashReturn jh_hash(int hashbitlen, const BitSequence* data, DataLength databitlen, BitSequence* hashval) { - hashState state; - - if ( hashbitlen == 224 || hashbitlen == 256 || hashbitlen == 384 || hashbitlen == 512 ) { - Init(&state, hashbitlen); - Update(&state, data, databitlen); - Final(&state, hashval); - return SUCCESS; - } - else - return(BAD_HASHLEN); + hashState state; + + if(hashbitlen == 224 || hashbitlen == 256 || hashbitlen == 384 || hashbitlen == 512) + { + Init(&state, hashbitlen); + Update(&state, data, databitlen); + Final(&state, hashval); + return SUCCESS; + } + else + return (BAD_HASHLEN); } diff --git a/xmrstak/backend/cpu/crypto/c_jh.h b/xmrstak/backend/cpu/crypto/c_jh.h index d10d40fe5..34d30e6b4 100644 --- a/xmrstak/backend/cpu/crypto/c_jh.h +++ b/xmrstak/backend/cpu/crypto/c_jh.h @@ -16,4 +16,4 @@ #include "hash.h" -HashReturn jh_hash(int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval); +HashReturn jh_hash(int hashbitlen, const BitSequence* data, DataLength databitlen, BitSequence* hashval); diff --git a/xmrstak/backend/cpu/crypto/c_keccak.c b/xmrstak/backend/cpu/crypto/c_keccak.c index 63c16147d..0af6b02ef 100644 --- a/xmrstak/backend/cpu/crypto/c_keccak.c +++ b/xmrstak/backend/cpu/crypto/c_keccak.c @@ -2,8 +2,8 @@ // 19-Nov-11 Markku-Juhani O. Saarinen // A baseline Keccak (3rd round) implementation. -#include #include +#include #define HASH_DATA_AREA 136 #define KECCAK_ROUNDS 24 @@ -13,16 +13,15 @@ #endif const uint64_t keccakf_rndc[24] = -{ - 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, - 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, - 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, - 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, - 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, - 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, - 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, - 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 -}; + { + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008}; // update the state with given number of rounds @@ -31,7 +30,8 @@ void keccakf(uint64_t st[25], int rounds) int i, j, round; uint64_t t, bc[5]; - for (round = 0; round < rounds; ++round) { + for(round = 0; round < rounds; ++round) + { // Theta bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20]; @@ -40,10 +40,11 @@ void keccakf(uint64_t st[25], int rounds) bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23]; bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24]; - for (i = 0; i < 5; ++i) { + for(i = 0; i < 5; ++i) + { t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1); - st[i ] ^= t; - st[i + 5] ^= t; + st[i] ^= t; + st[i + 5] ^= t; st[i + 10] ^= t; st[i + 15] ^= t; st[i + 20] ^= t; @@ -51,81 +52,81 @@ void keccakf(uint64_t st[25], int rounds) // Rho Pi t = st[1]; - st[ 1] = ROTL64(st[ 6], 44); - st[ 6] = ROTL64(st[ 9], 20); - st[ 9] = ROTL64(st[22], 61); + st[1] = ROTL64(st[6], 44); + st[6] = ROTL64(st[9], 20); + st[9] = ROTL64(st[22], 61); st[22] = ROTL64(st[14], 39); st[14] = ROTL64(st[20], 18); - st[20] = ROTL64(st[ 2], 62); - st[ 2] = ROTL64(st[12], 43); + st[20] = ROTL64(st[2], 62); + st[2] = ROTL64(st[12], 43); st[12] = ROTL64(st[13], 25); - st[13] = ROTL64(st[19], 8); + st[13] = ROTL64(st[19], 8); st[19] = ROTL64(st[23], 56); st[23] = ROTL64(st[15], 41); - st[15] = ROTL64(st[ 4], 27); - st[ 4] = ROTL64(st[24], 14); - st[24] = ROTL64(st[21], 2); - st[21] = ROTL64(st[ 8], 55); - st[ 8] = ROTL64(st[16], 45); - st[16] = ROTL64(st[ 5], 36); - st[ 5] = ROTL64(st[ 3], 28); - st[ 3] = ROTL64(st[18], 21); + st[15] = ROTL64(st[4], 27); + st[4] = ROTL64(st[24], 14); + st[24] = ROTL64(st[21], 2); + st[21] = ROTL64(st[8], 55); + st[8] = ROTL64(st[16], 45); + st[16] = ROTL64(st[5], 36); + st[5] = ROTL64(st[3], 28); + st[3] = ROTL64(st[18], 21); st[18] = ROTL64(st[17], 15); st[17] = ROTL64(st[11], 10); - st[11] = ROTL64(st[ 7], 6); - st[ 7] = ROTL64(st[10], 3); + st[11] = ROTL64(st[7], 6); + st[7] = ROTL64(st[10], 3); st[10] = ROTL64(t, 1); // Chi // unrolled loop, where only last iteration is different j = 0; - bc[0] = st[j ]; + bc[0] = st[j]; bc[1] = st[j + 1]; - st[j ] ^= (~st[j + 1]) & st[j + 2]; + st[j] ^= (~st[j + 1]) & st[j + 2]; st[j + 1] ^= (~st[j + 2]) & st[j + 3]; st[j + 2] ^= (~st[j + 3]) & st[j + 4]; st[j + 3] ^= (~st[j + 4]) & bc[0]; st[j + 4] ^= (~bc[0]) & bc[1]; j = 5; - bc[0] = st[j ]; + bc[0] = st[j]; bc[1] = st[j + 1]; - st[j ] ^= (~st[j + 1]) & st[j + 2]; + st[j] ^= (~st[j + 1]) & st[j + 2]; st[j + 1] ^= (~st[j + 2]) & st[j + 3]; st[j + 2] ^= (~st[j + 3]) & st[j + 4]; st[j + 3] ^= (~st[j + 4]) & bc[0]; st[j + 4] ^= (~bc[0]) & bc[1]; j = 10; - bc[0] = st[j ]; + bc[0] = st[j]; bc[1] = st[j + 1]; - st[j ] ^= (~st[j + 1]) & st[j + 2]; + st[j] ^= (~st[j + 1]) & st[j + 2]; st[j + 1] ^= (~st[j + 2]) & st[j + 3]; st[j + 2] ^= (~st[j + 3]) & st[j + 4]; st[j + 3] ^= (~st[j + 4]) & bc[0]; st[j + 4] ^= (~bc[0]) & bc[1]; j = 15; - bc[0] = st[j ]; + bc[0] = st[j]; bc[1] = st[j + 1]; - st[j ] ^= (~st[j + 1]) & st[j + 2]; + st[j] ^= (~st[j + 1]) & st[j + 2]; st[j + 1] ^= (~st[j + 2]) & st[j + 3]; st[j + 2] ^= (~st[j + 3]) & st[j + 4]; st[j + 3] ^= (~st[j + 4]) & bc[0]; st[j + 4] ^= (~bc[0]) & bc[1]; j = 20; - bc[0] = st[j ]; + bc[0] = st[j]; bc[1] = st[j + 1]; bc[2] = st[j + 2]; bc[3] = st[j + 3]; bc[4] = st[j + 4]; - st[j ] ^= (~bc[1]) & bc[2]; + st[j] ^= (~bc[1]) & bc[2]; st[j + 1] ^= (~bc[2]) & bc[3]; st[j + 2] ^= (~bc[3]) & bc[4]; st[j + 3] ^= (~bc[4]) & bc[0]; @@ -139,7 +140,7 @@ void keccakf(uint64_t st[25], int rounds) // compute a keccak hash (md) of given byte length from "in" typedef uint64_t state_t[25]; -void keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen) +void keccak(const uint8_t* in, int inlen, uint8_t* md, int mdlen) { state_t st; uint8_t temp[144]; @@ -150,9 +151,10 @@ void keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen) memset(st, 0, sizeof(st)); - for ( ; inlen >= rsiz; inlen -= rsiz, in += rsiz) { - for (i = 0; i < rsizw; i++) - st[i] ^= ((uint64_t *) in)[i]; + for(; inlen >= rsiz; inlen -= rsiz, in += rsiz) + { + for(i = 0; i < rsizw; i++) + st[i] ^= ((uint64_t*)in)[i]; keccakf(st, KECCAK_ROUNDS); } @@ -162,15 +164,15 @@ void keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen) memset(temp + inlen, 0, rsiz - inlen); temp[rsiz - 1] |= 0x80; - for (i = 0; i < rsizw; i++) - st[i] ^= ((uint64_t *) temp)[i]; + for(i = 0; i < rsizw; i++) + st[i] ^= ((uint64_t*)temp)[i]; keccakf(st, KECCAK_ROUNDS); memcpy(md, st, mdlen); } -void keccak1600(const uint8_t *in, int inlen, uint8_t *md) +void keccak1600(const uint8_t* in, int inlen, uint8_t* md) { keccak(in, inlen, md, sizeof(state_t)); } diff --git a/xmrstak/backend/cpu/crypto/c_keccak.h b/xmrstak/backend/cpu/crypto/c_keccak.h index 4f7f85729..b7a26065e 100644 --- a/xmrstak/backend/cpu/crypto/c_keccak.h +++ b/xmrstak/backend/cpu/crypto/c_keccak.h @@ -16,11 +16,11 @@ #endif // compute a keccak hash (md) of given byte length from "in" -int keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen); +int keccak(const uint8_t* in, int inlen, uint8_t* md, int mdlen); // update the state void keccakf(uint64_t st[25], int norounds); -void keccak1600(const uint8_t *in, int inlen, uint8_t *md); +void keccak1600(const uint8_t* in, int inlen, uint8_t* md); #endif diff --git a/xmrstak/backend/cpu/crypto/c_skein.c b/xmrstak/backend/cpu/crypto/c_skein.c index e2d54425f..4b8cbb388 100644 --- a/xmrstak/backend/cpu/crypto/c_skein.c +++ b/xmrstak/backend/cpu/crypto/c_skein.c @@ -8,11 +8,11 @@ ** ************************************************************************/ -#define SKEIN_PORT_CODE /* instantiate any code in skein_port.h */ +#define SKEIN_PORT_CODE /* instantiate any code in skein_port.h */ -#include /* get size_t definition */ -#include /* get the memcpy/memset functions */ -#include "c_skein.h" /* get the Skein API definitions */ +#include "c_skein.h" /* get the Skein API definitions */ +#include /* get size_t definition */ +#include /* get the memcpy/memset functions */ #define DISABLE_UNUSED 0 @@ -24,72 +24,72 @@ #define SKEIN_512_NIST_MAX_HASHBITS (512) #endif -#define SKEIN_MODIFIER_WORDS ( 2) /* number of modifier (tweak) words */ +#define SKEIN_MODIFIER_WORDS (2) /* number of modifier (tweak) words */ -#define SKEIN_256_STATE_WORDS ( 4) -#define SKEIN_512_STATE_WORDS ( 8) -#define SKEIN1024_STATE_WORDS (16) -#define SKEIN_MAX_STATE_WORDS (16) +#define SKEIN_256_STATE_WORDS (4) +#define SKEIN_512_STATE_WORDS (8) +#define SKEIN1024_STATE_WORDS (16) +#define SKEIN_MAX_STATE_WORDS (16) -#define SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS) -#define SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS) -#define SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS) +#define SKEIN_256_STATE_BYTES (8 * SKEIN_256_STATE_WORDS) +#define SKEIN_512_STATE_BYTES (8 * SKEIN_512_STATE_WORDS) +#define SKEIN1024_STATE_BYTES (8 * SKEIN1024_STATE_WORDS) -#define SKEIN_256_STATE_BITS (64*SKEIN_256_STATE_WORDS) -#define SKEIN_512_STATE_BITS (64*SKEIN_512_STATE_WORDS) -#define SKEIN1024_STATE_BITS (64*SKEIN1024_STATE_WORDS) +#define SKEIN_256_STATE_BITS (64 * SKEIN_256_STATE_WORDS) +#define SKEIN_512_STATE_BITS (64 * SKEIN_512_STATE_WORDS) +#define SKEIN1024_STATE_BITS (64 * SKEIN1024_STATE_WORDS) -#define SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS) -#define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS) -#define SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS) +#define SKEIN_256_BLOCK_BYTES (8 * SKEIN_256_STATE_WORDS) +#define SKEIN_512_BLOCK_BYTES (8 * SKEIN_512_STATE_WORDS) +#define SKEIN1024_BLOCK_BYTES (8 * SKEIN1024_STATE_WORDS) -#define SKEIN_RND_SPECIAL (1000u) -#define SKEIN_RND_KEY_INITIAL (SKEIN_RND_SPECIAL+0u) -#define SKEIN_RND_KEY_INJECT (SKEIN_RND_SPECIAL+1u) -#define SKEIN_RND_FEED_FWD (SKEIN_RND_SPECIAL+2u) +#define SKEIN_RND_SPECIAL (1000u) +#define SKEIN_RND_KEY_INITIAL (SKEIN_RND_SPECIAL + 0u) +#define SKEIN_RND_KEY_INJECT (SKEIN_RND_SPECIAL + 1u) +#define SKEIN_RND_FEED_FWD (SKEIN_RND_SPECIAL + 2u) typedef struct { - size_t hashBitLen; /* size of hash result, in bits */ - size_t bCnt; /* current byte count in buffer b[] */ - u64b_t T[SKEIN_MODIFIER_WORDS]; /* tweak words: T[0]=byte cnt, T[1]=flags */ + size_t hashBitLen; /* size of hash result, in bits */ + size_t bCnt; /* current byte count in buffer b[] */ + u64b_t T[SKEIN_MODIFIER_WORDS]; /* tweak words: T[0]=byte cnt, T[1]=flags */ } Skein_Ctxt_Hdr_t; -typedef struct /* 256-bit Skein hash context structure */ +typedef struct /* 256-bit Skein hash context structure */ { - Skein_Ctxt_Hdr_t h; /* common header context variables */ - u64b_t X[SKEIN_256_STATE_WORDS]; /* chaining variables */ - u08b_t b[SKEIN_256_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */ + Skein_Ctxt_Hdr_t h; /* common header context variables */ + u64b_t X[SKEIN_256_STATE_WORDS]; /* chaining variables */ + u08b_t b[SKEIN_256_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */ } Skein_256_Ctxt_t; -typedef struct /* 512-bit Skein hash context structure */ +typedef struct /* 512-bit Skein hash context structure */ { - Skein_Ctxt_Hdr_t h; /* common header context variables */ - u64b_t X[SKEIN_512_STATE_WORDS]; /* chaining variables */ - u08b_t b[SKEIN_512_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */ + Skein_Ctxt_Hdr_t h; /* common header context variables */ + u64b_t X[SKEIN_512_STATE_WORDS]; /* chaining variables */ + u08b_t b[SKEIN_512_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */ } Skein_512_Ctxt_t; -typedef struct /* 1024-bit Skein hash context structure */ +typedef struct /* 1024-bit Skein hash context structure */ { - Skein_Ctxt_Hdr_t h; /* common header context variables */ - u64b_t X[SKEIN1024_STATE_WORDS]; /* chaining variables */ - u08b_t b[SKEIN1024_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */ + Skein_Ctxt_Hdr_t h; /* common header context variables */ + u64b_t X[SKEIN1024_STATE_WORDS]; /* chaining variables */ + u08b_t b[SKEIN1024_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */ } Skein1024_Ctxt_t; /* Skein APIs for (incremental) "straight hashing" */ #if SKEIN_256_NIST_MAX_HASH_BITS -static int Skein_256_Init (Skein_256_Ctxt_t *ctx, size_t hashBitLen); +static int Skein_256_Init(Skein_256_Ctxt_t* ctx, size_t hashBitLen); #endif -static int Skein_512_Init (Skein_512_Ctxt_t *ctx, size_t hashBitLen); -static int Skein1024_Init (Skein1024_Ctxt_t *ctx, size_t hashBitLen); +static int Skein_512_Init(Skein_512_Ctxt_t* ctx, size_t hashBitLen); +static int Skein1024_Init(Skein1024_Ctxt_t* ctx, size_t hashBitLen); -static int Skein_256_Update(Skein_256_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt); -static int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt); -static int Skein1024_Update(Skein1024_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt); +static int Skein_256_Update(Skein_256_Ctxt_t* ctx, const u08b_t* msg, size_t msgByteCnt); +static int Skein_512_Update(Skein_512_Ctxt_t* ctx, const u08b_t* msg, size_t msgByteCnt); +static int Skein1024_Update(Skein1024_Ctxt_t* ctx, const u08b_t* msg, size_t msgByteCnt); -static int Skein_256_Final (Skein_256_Ctxt_t *ctx, u08b_t * hashVal); -static int Skein_512_Final (Skein_512_Ctxt_t *ctx, u08b_t * hashVal); -static int Skein1024_Final (Skein1024_Ctxt_t *ctx, u08b_t * hashVal); +static int Skein_256_Final(Skein_256_Ctxt_t* ctx, u08b_t* hashVal); +static int Skein_512_Final(Skein_512_Ctxt_t* ctx, u08b_t* hashVal); +static int Skein1024_Final(Skein1024_Ctxt_t* ctx, u08b_t* hashVal); /* ** Skein APIs for "extended" initialization: MAC keys, tree hashing. @@ -126,7 +126,7 @@ static int Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, u08b_t * hashVal); #define SKEIN_TREE_HASH (1) #endif #if 0 -#if SKEIN_TREE_HASH +#if SKEIN_TREE_HASH static int Skein_256_Output (Skein_256_Ctxt_t *ctx, u08b_t * hashVal); static int Skein_512_Output (Skein_512_Ctxt_t *ctx, u08b_t * hashVal); static int Skein1024_Output (Skein1024_Ctxt_t *ctx, u08b_t * hashVal); @@ -142,128 +142,146 @@ static int Skein1024_Output (Skein1024_Ctxt_t *ctx, u08b_t * hashVal); ******************************************************************/ /* tweak word T[1]: bit field starting positions */ -#define SKEIN_T1_BIT(BIT) ((BIT) - 64) /* offset 64 because it's the second word */ +#define SKEIN_T1_BIT(BIT) ((BIT)-64) /* offset 64 because it's the second word */ -#define SKEIN_T1_POS_TREE_LVL SKEIN_T1_BIT(112) /* bits 112..118: level in hash tree */ -#define SKEIN_T1_POS_BIT_PAD SKEIN_T1_BIT(119) /* bit 119 : partial final input byte */ -#define SKEIN_T1_POS_BLK_TYPE SKEIN_T1_BIT(120) /* bits 120..125: type field */ -#define SKEIN_T1_POS_FIRST SKEIN_T1_BIT(126) /* bits 126 : first block flag */ -#define SKEIN_T1_POS_FINAL SKEIN_T1_BIT(127) /* bit 127 : final block flag */ +#define SKEIN_T1_POS_TREE_LVL SKEIN_T1_BIT(112) /* bits 112..118: level in hash tree */ +#define SKEIN_T1_POS_BIT_PAD SKEIN_T1_BIT(119) /* bit 119 : partial final input byte */ +#define SKEIN_T1_POS_BLK_TYPE SKEIN_T1_BIT(120) /* bits 120..125: type field */ +#define SKEIN_T1_POS_FIRST SKEIN_T1_BIT(126) /* bits 126 : first block flag */ +#define SKEIN_T1_POS_FINAL SKEIN_T1_BIT(127) /* bit 127 : final block flag */ /* tweak word T[1]: flag bit definition(s) */ -#define SKEIN_T1_FLAG_FIRST (((u64b_t) 1 ) << SKEIN_T1_POS_FIRST) -#define SKEIN_T1_FLAG_FINAL (((u64b_t) 1 ) << SKEIN_T1_POS_FINAL) -#define SKEIN_T1_FLAG_BIT_PAD (((u64b_t) 1 ) << SKEIN_T1_POS_BIT_PAD) +#define SKEIN_T1_FLAG_FIRST (((u64b_t)1) << SKEIN_T1_POS_FIRST) +#define SKEIN_T1_FLAG_FINAL (((u64b_t)1) << SKEIN_T1_POS_FINAL) +#define SKEIN_T1_FLAG_BIT_PAD (((u64b_t)1) << SKEIN_T1_POS_BIT_PAD) /* tweak word T[1]: tree level bit field mask */ -#define SKEIN_T1_TREE_LVL_MASK (((u64b_t)0x7F) << SKEIN_T1_POS_TREE_LVL) -#define SKEIN_T1_TREE_LEVEL(n) (((u64b_t) (n)) << SKEIN_T1_POS_TREE_LVL) +#define SKEIN_T1_TREE_LVL_MASK (((u64b_t)0x7F) << SKEIN_T1_POS_TREE_LVL) +#define SKEIN_T1_TREE_LEVEL(n) (((u64b_t)(n)) << SKEIN_T1_POS_TREE_LVL) /* tweak word T[1]: block type field */ -#define SKEIN_BLK_TYPE_KEY ( 0) /* key, for MAC and KDF */ -#define SKEIN_BLK_TYPE_CFG ( 4) /* configuration block */ -#define SKEIN_BLK_TYPE_PERS ( 8) /* personalization string */ -#define SKEIN_BLK_TYPE_PK (12) /* public key (for digital signature hashing) */ -#define SKEIN_BLK_TYPE_KDF (16) /* key identifier for KDF */ -#define SKEIN_BLK_TYPE_NONCE (20) /* nonce for PRNG */ -#define SKEIN_BLK_TYPE_MSG (48) /* message processing */ -#define SKEIN_BLK_TYPE_OUT (63) /* output stage */ -#define SKEIN_BLK_TYPE_MASK (63) /* bit field mask */ - -#define SKEIN_T1_BLK_TYPE(T) (((u64b_t) (SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE) -#define SKEIN_T1_BLK_TYPE_KEY SKEIN_T1_BLK_TYPE(KEY) /* key, for MAC and KDF */ -#define SKEIN_T1_BLK_TYPE_CFG SKEIN_T1_BLK_TYPE(CFG) /* configuration block */ -#define SKEIN_T1_BLK_TYPE_PERS SKEIN_T1_BLK_TYPE(PERS) /* personalization string */ -#define SKEIN_T1_BLK_TYPE_PK SKEIN_T1_BLK_TYPE(PK) /* public key (for digital signature hashing) */ -#define SKEIN_T1_BLK_TYPE_KDF SKEIN_T1_BLK_TYPE(KDF) /* key identifier for KDF */ -#define SKEIN_T1_BLK_TYPE_NONCE SKEIN_T1_BLK_TYPE(NONCE)/* nonce for PRNG */ -#define SKEIN_T1_BLK_TYPE_MSG SKEIN_T1_BLK_TYPE(MSG) /* message processing */ -#define SKEIN_T1_BLK_TYPE_OUT SKEIN_T1_BLK_TYPE(OUT) /* output stage */ -#define SKEIN_T1_BLK_TYPE_MASK SKEIN_T1_BLK_TYPE(MASK) /* field bit mask */ - -#define SKEIN_T1_BLK_TYPE_CFG_FINAL (SKEIN_T1_BLK_TYPE_CFG | SKEIN_T1_FLAG_FINAL) -#define SKEIN_T1_BLK_TYPE_OUT_FINAL (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL) - -#define SKEIN_VERSION (1) - -#ifndef SKEIN_ID_STRING_LE /* allow compile-time personalization */ -#define SKEIN_ID_STRING_LE (0x33414853) /* "SHA3" (little-endian)*/ -#endif - -#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((u64b_t) (hi32)) << 32)) -#define SKEIN_SCHEMA_VER SKEIN_MK_64(SKEIN_VERSION,SKEIN_ID_STRING_LE) -#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22) - -#define SKEIN_CFG_STR_LEN (4*8) +#define SKEIN_BLK_TYPE_KEY (0) /* key, for MAC and KDF */ +#define SKEIN_BLK_TYPE_CFG (4) /* configuration block */ +#define SKEIN_BLK_TYPE_PERS (8) /* personalization string */ +#define SKEIN_BLK_TYPE_PK (12) /* public key (for digital signature hashing) */ +#define SKEIN_BLK_TYPE_KDF (16) /* key identifier for KDF */ +#define SKEIN_BLK_TYPE_NONCE (20) /* nonce for PRNG */ +#define SKEIN_BLK_TYPE_MSG (48) /* message processing */ +#define SKEIN_BLK_TYPE_OUT (63) /* output stage */ +#define SKEIN_BLK_TYPE_MASK (63) /* bit field mask */ + +#define SKEIN_T1_BLK_TYPE(T) (((u64b_t)(SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE) +#define SKEIN_T1_BLK_TYPE_KEY SKEIN_T1_BLK_TYPE(KEY) /* key, for MAC and KDF */ +#define SKEIN_T1_BLK_TYPE_CFG SKEIN_T1_BLK_TYPE(CFG) /* configuration block */ +#define SKEIN_T1_BLK_TYPE_PERS SKEIN_T1_BLK_TYPE(PERS) /* personalization string */ +#define SKEIN_T1_BLK_TYPE_PK SKEIN_T1_BLK_TYPE(PK) /* public key (for digital signature hashing) */ +#define SKEIN_T1_BLK_TYPE_KDF SKEIN_T1_BLK_TYPE(KDF) /* key identifier for KDF */ +#define SKEIN_T1_BLK_TYPE_NONCE SKEIN_T1_BLK_TYPE(NONCE) /* nonce for PRNG */ +#define SKEIN_T1_BLK_TYPE_MSG SKEIN_T1_BLK_TYPE(MSG) /* message processing */ +#define SKEIN_T1_BLK_TYPE_OUT SKEIN_T1_BLK_TYPE(OUT) /* output stage */ +#define SKEIN_T1_BLK_TYPE_MASK SKEIN_T1_BLK_TYPE(MASK) /* field bit mask */ + +#define SKEIN_T1_BLK_TYPE_CFG_FINAL (SKEIN_T1_BLK_TYPE_CFG | SKEIN_T1_FLAG_FINAL) +#define SKEIN_T1_BLK_TYPE_OUT_FINAL (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL) + +#define SKEIN_VERSION (1) + +#ifndef SKEIN_ID_STRING_LE /* allow compile-time personalization */ +#define SKEIN_ID_STRING_LE (0x33414853) /* "SHA3" (little-endian)*/ +#endif + +#define SKEIN_MK_64(hi32, lo32) ((lo32) + (((u64b_t)(hi32)) << 32)) +#define SKEIN_SCHEMA_VER SKEIN_MK_64(SKEIN_VERSION, SKEIN_ID_STRING_LE) +#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA, 0xA9FC1A22) + +#define SKEIN_CFG_STR_LEN (4 * 8) /* bit field definitions in config block treeInfo word */ -#define SKEIN_CFG_TREE_LEAF_SIZE_POS ( 0) -#define SKEIN_CFG_TREE_NODE_SIZE_POS ( 8) -#define SKEIN_CFG_TREE_MAX_LEVEL_POS (16) +#define SKEIN_CFG_TREE_LEAF_SIZE_POS (0) +#define SKEIN_CFG_TREE_NODE_SIZE_POS (8) +#define SKEIN_CFG_TREE_MAX_LEVEL_POS (16) -#define SKEIN_CFG_TREE_LEAF_SIZE_MSK (((u64b_t) 0xFF) << SKEIN_CFG_TREE_LEAF_SIZE_POS) -#define SKEIN_CFG_TREE_NODE_SIZE_MSK (((u64b_t) 0xFF) << SKEIN_CFG_TREE_NODE_SIZE_POS) -#define SKEIN_CFG_TREE_MAX_LEVEL_MSK (((u64b_t) 0xFF) << SKEIN_CFG_TREE_MAX_LEVEL_POS) +#define SKEIN_CFG_TREE_LEAF_SIZE_MSK (((u64b_t)0xFF) << SKEIN_CFG_TREE_LEAF_SIZE_POS) +#define SKEIN_CFG_TREE_NODE_SIZE_MSK (((u64b_t)0xFF) << SKEIN_CFG_TREE_NODE_SIZE_POS) +#define SKEIN_CFG_TREE_MAX_LEVEL_MSK (((u64b_t)0xFF) << SKEIN_CFG_TREE_MAX_LEVEL_POS) -#define SKEIN_CFG_TREE_INFO(leaf,node,maxLvl) \ - ( (((u64b_t)(leaf )) << SKEIN_CFG_TREE_LEAF_SIZE_POS) | \ - (((u64b_t)(node )) << SKEIN_CFG_TREE_NODE_SIZE_POS) | \ - (((u64b_t)(maxLvl)) << SKEIN_CFG_TREE_MAX_LEVEL_POS) ) +#define SKEIN_CFG_TREE_INFO(leaf, node, maxLvl) \ + ((((u64b_t)(leaf)) << SKEIN_CFG_TREE_LEAF_SIZE_POS) | \ + (((u64b_t)(node)) << SKEIN_CFG_TREE_NODE_SIZE_POS) | \ + (((u64b_t)(maxLvl)) << SKEIN_CFG_TREE_MAX_LEVEL_POS)) -#define SKEIN_CFG_TREE_INFO_SEQUENTIAL SKEIN_CFG_TREE_INFO(0,0,0) /* use as treeInfo in InitExt() call for sequential processing */ +#define SKEIN_CFG_TREE_INFO_SEQUENTIAL SKEIN_CFG_TREE_INFO(0, 0, 0) /* use as treeInfo in InitExt() call for sequential processing */ /* ** Skein macros for getting/setting tweak words, etc. ** These are useful for partial input bytes, hash tree init/update, etc. **/ -#define Skein_Get_Tweak(ctxPtr,TWK_NUM) ((ctxPtr)->h.T[TWK_NUM]) -#define Skein_Set_Tweak(ctxPtr,TWK_NUM,tVal) {(ctxPtr)->h.T[TWK_NUM] = (tVal);} +#define Skein_Get_Tweak(ctxPtr, TWK_NUM) ((ctxPtr)->h.T[TWK_NUM]) +#define Skein_Set_Tweak(ctxPtr, TWK_NUM, tVal) \ + { \ + (ctxPtr)->h.T[TWK_NUM] = (tVal); \ + } -#define Skein_Get_T0(ctxPtr) Skein_Get_Tweak(ctxPtr,0) -#define Skein_Get_T1(ctxPtr) Skein_Get_Tweak(ctxPtr,1) -#define Skein_Set_T0(ctxPtr,T0) Skein_Set_Tweak(ctxPtr,0,T0) -#define Skein_Set_T1(ctxPtr,T1) Skein_Set_Tweak(ctxPtr,1,T1) +#define Skein_Get_T0(ctxPtr) Skein_Get_Tweak(ctxPtr, 0) +#define Skein_Get_T1(ctxPtr) Skein_Get_Tweak(ctxPtr, 1) +#define Skein_Set_T0(ctxPtr, T0) Skein_Set_Tweak(ctxPtr, 0, T0) +#define Skein_Set_T1(ctxPtr, T1) Skein_Set_Tweak(ctxPtr, 1, T1) /* set both tweak words at once */ -#define Skein_Set_T0_T1(ctxPtr,T0,T1) \ -{ \ - Skein_Set_T0(ctxPtr,(T0)); \ - Skein_Set_T1(ctxPtr,(T1)); \ -} +#define Skein_Set_T0_T1(ctxPtr, T0, T1) \ + { \ + Skein_Set_T0(ctxPtr, (T0)); \ + Skein_Set_T1(ctxPtr, (T1)); \ + } -#define Skein_Set_Type(ctxPtr,BLK_TYPE) \ - Skein_Set_T1(ctxPtr,SKEIN_T1_BLK_TYPE_##BLK_TYPE) +#define Skein_Set_Type(ctxPtr, BLK_TYPE) \ + Skein_Set_T1(ctxPtr, SKEIN_T1_BLK_TYPE_##BLK_TYPE) /* set up for starting with a new type: h.T[0]=0; h.T[1] = NEW_TYPE; h.bCnt=0; */ -#define Skein_Start_New_Type(ctxPtr,BLK_TYPE) \ -{ Skein_Set_T0_T1(ctxPtr,0,SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); (ctxPtr)->h.bCnt=0; } +#define Skein_Start_New_Type(ctxPtr, BLK_TYPE) \ + { \ + Skein_Set_T0_T1(ctxPtr, 0, SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); \ + (ctxPtr)->h.bCnt = 0; \ + } -#define Skein_Clear_First_Flag(hdr) { (hdr).T[1] &= ~SKEIN_T1_FLAG_FIRST; } -#define Skein_Set_Bit_Pad_Flag(hdr) { (hdr).T[1] |= SKEIN_T1_FLAG_BIT_PAD; } +#define Skein_Clear_First_Flag(hdr) \ + { \ + (hdr).T[1] &= ~SKEIN_T1_FLAG_FIRST; \ + } +#define Skein_Set_Bit_Pad_Flag(hdr) \ + { \ + (hdr).T[1] |= SKEIN_T1_FLAG_BIT_PAD; \ + } -#define Skein_Set_Tree_Level(hdr,height) { (hdr).T[1] |= SKEIN_T1_TREE_LEVEL(height);} +#define Skein_Set_Tree_Level(hdr, height) \ + { \ + (hdr).T[1] |= SKEIN_T1_TREE_LEVEL(height); \ + } /***************************************************************** ** "Internal" Skein definitions for debugging and error checking ******************************************************************/ -#define Skein_Show_Block(bits,ctx,X,blkPtr,wPtr,ksEvenPtr,ksOddPtr) -#define Skein_Show_Round(bits,ctx,r,X) -#define Skein_Show_R_Ptr(bits,ctx,r,X_ptr) -#define Skein_Show_Final(bits,ctx,cnt,outPtr) -#define Skein_Show_Key(bits,ctx,key,keyBytes) - - -#ifndef SKEIN_ERR_CHECK /* run-time checks (e.g., bad params, uninitialized context)? */ -#define Skein_Assert(x,retCode)/* default: ignore all Asserts, for performance */ +#define Skein_Show_Block(bits, ctx, X, blkPtr, wPtr, ksEvenPtr, ksOddPtr) +#define Skein_Show_Round(bits, ctx, r, X) +#define Skein_Show_R_Ptr(bits, ctx, r, X_ptr) +#define Skein_Show_Final(bits, ctx, cnt, outPtr) +#define Skein_Show_Key(bits, ctx, key, keyBytes) + +#ifndef SKEIN_ERR_CHECK /* run-time checks (e.g., bad params, uninitialized context)? */ +#define Skein_Assert(x, retCode) /* default: ignore all Asserts, for performance */ #define Skein_assert(x) -#elif defined(SKEIN_ASSERT) +#elif defined(SKEIN_ASSERT) #include -#define Skein_Assert(x,retCode) assert(x) -#define Skein_assert(x) assert(x) +#define Skein_Assert(x, retCode) assert(x) +#define Skein_assert(x) assert(x) #else #include -#define Skein_Assert(x,retCode) { if (!(x)) return retCode; } /* caller error */ -#define Skein_assert(x) assert(x) /* internal error */ +#define Skein_Assert(x, retCode) \ + { \ + if(!(x)) \ + return retCode; \ + } /* caller error */ +#define Skein_assert(x) assert(x) /* internal error */ #endif /***************************************************************** @@ -271,48 +289,135 @@ static int Skein1024_Output (Skein1024_Ctxt_t *ctx, u08b_t * hashVal); ******************************************************************/ enum { - /* Skein_256 round rotation constants */ - R_256_0_0=14, R_256_0_1=16, - R_256_1_0=52, R_256_1_1=57, - R_256_2_0=23, R_256_2_1=40, - R_256_3_0= 5, R_256_3_1=37, - R_256_4_0=25, R_256_4_1=33, - R_256_5_0=46, R_256_5_1=12, - R_256_6_0=58, R_256_6_1=22, - R_256_7_0=32, R_256_7_1=32, - - /* Skein_512 round rotation constants */ - R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37, - R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42, - R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39, - R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56, - R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24, - R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17, - R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43, - R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22, - - /* Skein1024 round rotation constants */ - R1024_0_0=24, R1024_0_1=13, R1024_0_2= 8, R1024_0_3=47, R1024_0_4= 8, R1024_0_5=17, R1024_0_6=22, R1024_0_7=37, - R1024_1_0=38, R1024_1_1=19, R1024_1_2=10, R1024_1_3=55, R1024_1_4=49, R1024_1_5=18, R1024_1_6=23, R1024_1_7=52, - R1024_2_0=33, R1024_2_1= 4, R1024_2_2=51, R1024_2_3=13, R1024_2_4=34, R1024_2_5=41, R1024_2_6=59, R1024_2_7=17, - R1024_3_0= 5, R1024_3_1=20, R1024_3_2=48, R1024_3_3=41, R1024_3_4=47, R1024_3_5=28, R1024_3_6=16, R1024_3_7=25, - R1024_4_0=41, R1024_4_1= 9, R1024_4_2=37, R1024_4_3=31, R1024_4_4=12, R1024_4_5=47, R1024_4_6=44, R1024_4_7=30, - R1024_5_0=16, R1024_5_1=34, R1024_5_2=56, R1024_5_3=51, R1024_5_4= 4, R1024_5_5=53, R1024_5_6=42, R1024_5_7=41, - R1024_6_0=31, R1024_6_1=44, R1024_6_2=47, R1024_6_3=46, R1024_6_4=19, R1024_6_5=42, R1024_6_6=44, R1024_6_7=25, - R1024_7_0= 9, R1024_7_1=48, R1024_7_2=35, R1024_7_3=52, R1024_7_4=23, R1024_7_5=31, R1024_7_6=37, R1024_7_7=20 + /* Skein_256 round rotation constants */ + R_256_0_0 = 14, + R_256_0_1 = 16, + R_256_1_0 = 52, + R_256_1_1 = 57, + R_256_2_0 = 23, + R_256_2_1 = 40, + R_256_3_0 = 5, + R_256_3_1 = 37, + R_256_4_0 = 25, + R_256_4_1 = 33, + R_256_5_0 = 46, + R_256_5_1 = 12, + R_256_6_0 = 58, + R_256_6_1 = 22, + R_256_7_0 = 32, + R_256_7_1 = 32, + + /* Skein_512 round rotation constants */ + R_512_0_0 = 46, + R_512_0_1 = 36, + R_512_0_2 = 19, + R_512_0_3 = 37, + R_512_1_0 = 33, + R_512_1_1 = 27, + R_512_1_2 = 14, + R_512_1_3 = 42, + R_512_2_0 = 17, + R_512_2_1 = 49, + R_512_2_2 = 36, + R_512_2_3 = 39, + R_512_3_0 = 44, + R_512_3_1 = 9, + R_512_3_2 = 54, + R_512_3_3 = 56, + R_512_4_0 = 39, + R_512_4_1 = 30, + R_512_4_2 = 34, + R_512_4_3 = 24, + R_512_5_0 = 13, + R_512_5_1 = 50, + R_512_5_2 = 10, + R_512_5_3 = 17, + R_512_6_0 = 25, + R_512_6_1 = 29, + R_512_6_2 = 39, + R_512_6_3 = 43, + R_512_7_0 = 8, + R_512_7_1 = 35, + R_512_7_2 = 56, + R_512_7_3 = 22, + + /* Skein1024 round rotation constants */ + R1024_0_0 = 24, + R1024_0_1 = 13, + R1024_0_2 = 8, + R1024_0_3 = 47, + R1024_0_4 = 8, + R1024_0_5 = 17, + R1024_0_6 = 22, + R1024_0_7 = 37, + R1024_1_0 = 38, + R1024_1_1 = 19, + R1024_1_2 = 10, + R1024_1_3 = 55, + R1024_1_4 = 49, + R1024_1_5 = 18, + R1024_1_6 = 23, + R1024_1_7 = 52, + R1024_2_0 = 33, + R1024_2_1 = 4, + R1024_2_2 = 51, + R1024_2_3 = 13, + R1024_2_4 = 34, + R1024_2_5 = 41, + R1024_2_6 = 59, + R1024_2_7 = 17, + R1024_3_0 = 5, + R1024_3_1 = 20, + R1024_3_2 = 48, + R1024_3_3 = 41, + R1024_3_4 = 47, + R1024_3_5 = 28, + R1024_3_6 = 16, + R1024_3_7 = 25, + R1024_4_0 = 41, + R1024_4_1 = 9, + R1024_4_2 = 37, + R1024_4_3 = 31, + R1024_4_4 = 12, + R1024_4_5 = 47, + R1024_4_6 = 44, + R1024_4_7 = 30, + R1024_5_0 = 16, + R1024_5_1 = 34, + R1024_5_2 = 56, + R1024_5_3 = 51, + R1024_5_4 = 4, + R1024_5_5 = 53, + R1024_5_6 = 42, + R1024_5_7 = 41, + R1024_6_0 = 31, + R1024_6_1 = 44, + R1024_6_2 = 47, + R1024_6_3 = 46, + R1024_6_4 = 19, + R1024_6_5 = 42, + R1024_6_6 = 44, + R1024_6_7 = 25, + R1024_7_0 = 9, + R1024_7_1 = 48, + R1024_7_2 = 35, + R1024_7_3 = 52, + R1024_7_4 = 23, + R1024_7_5 = 31, + R1024_7_6 = 37, + R1024_7_7 = 20 }; #ifndef SKEIN_ROUNDS -#define SKEIN_256_ROUNDS_TOTAL (72) /* number of rounds for the different block sizes */ +#define SKEIN_256_ROUNDS_TOTAL (72) /* number of rounds for the different block sizes */ #define SKEIN_512_ROUNDS_TOTAL (72) #define SKEIN1024_ROUNDS_TOTAL (80) -#else /* allow command-line define in range 8*(5..14) */ -#define SKEIN_256_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS/100) + 5) % 10) + 5)) -#define SKEIN_512_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS/ 10) + 5) % 10) + 5)) -#define SKEIN1024_ROUNDS_TOTAL (8*((((SKEIN_ROUNDS ) + 5) % 10) + 5)) +#else /* allow command-line define in range 8*(5..14) */ +#define SKEIN_256_ROUNDS_TOTAL (8 * ((((SKEIN_ROUNDS / 100) + 5) % 10) + 5)) +#define SKEIN_512_ROUNDS_TOTAL (8 * ((((SKEIN_ROUNDS / 10) + 5) % 10) + 5)) +#define SKEIN1024_ROUNDS_TOTAL (8 * ((((SKEIN_ROUNDS) + 5) % 10) + 5)) #endif - /* ***************** Pre-computed Skein IVs ******************* ** @@ -332,239 +437,233 @@ enum /* blkSize = 256 bits. hashSize = 128 bits */ const u64b_t SKEIN_256_IV_128[] = { - MK_64(0xE1111906,0x964D7260), - MK_64(0x883DAAA7,0x7C8D811C), - MK_64(0x10080DF4,0x91960F7A), - MK_64(0xCCF7DDE5,0xB45BC1C2) - }; + MK_64(0xE1111906, 0x964D7260), + MK_64(0x883DAAA7, 0x7C8D811C), + MK_64(0x10080DF4, 0x91960F7A), + MK_64(0xCCF7DDE5, 0xB45BC1C2)}; /* blkSize = 256 bits. hashSize = 160 bits */ const u64b_t SKEIN_256_IV_160[] = { - MK_64(0x14202314,0x72825E98), - MK_64(0x2AC4E9A2,0x5A77E590), - MK_64(0xD47A5856,0x8838D63E), - MK_64(0x2DD2E496,0x8586AB7D) - }; + MK_64(0x14202314, 0x72825E98), + MK_64(0x2AC4E9A2, 0x5A77E590), + MK_64(0xD47A5856, 0x8838D63E), + MK_64(0x2DD2E496, 0x8586AB7D)}; /* blkSize = 256 bits. hashSize = 224 bits */ const u64b_t SKEIN_256_IV_224[] = { - MK_64(0xC6098A8C,0x9AE5EA0B), - MK_64(0x876D5686,0x08C5191C), - MK_64(0x99CB88D7,0xD7F53884), - MK_64(0x384BDDB1,0xAEDDB5DE) - }; + MK_64(0xC6098A8C, 0x9AE5EA0B), + MK_64(0x876D5686, 0x08C5191C), + MK_64(0x99CB88D7, 0xD7F53884), + MK_64(0x384BDDB1, 0xAEDDB5DE)}; /* blkSize = 256 bits. hashSize = 256 bits */ const u64b_t SKEIN_256_IV_256[] = { - MK_64(0xFC9DA860,0xD048B449), - MK_64(0x2FCA6647,0x9FA7D833), - MK_64(0xB33BC389,0x6656840F), - MK_64(0x6A54E920,0xFDE8DA69) - }; + MK_64(0xFC9DA860, 0xD048B449), + MK_64(0x2FCA6647, 0x9FA7D833), + MK_64(0xB33BC389, 0x6656840F), + MK_64(0x6A54E920, 0xFDE8DA69)}; /* blkSize = 512 bits. hashSize = 128 bits */ const u64b_t SKEIN_512_IV_128[] = { - MK_64(0xA8BC7BF3,0x6FBF9F52), - MK_64(0x1E9872CE,0xBD1AF0AA), - MK_64(0x309B1790,0xB32190D3), - MK_64(0xBCFBB854,0x3F94805C), - MK_64(0x0DA61BCD,0x6E31B11B), - MK_64(0x1A18EBEA,0xD46A32E3), - MK_64(0xA2CC5B18,0xCE84AA82), - MK_64(0x6982AB28,0x9D46982D) - }; + MK_64(0xA8BC7BF3, 0x6FBF9F52), + MK_64(0x1E9872CE, 0xBD1AF0AA), + MK_64(0x309B1790, 0xB32190D3), + MK_64(0xBCFBB854, 0x3F94805C), + MK_64(0x0DA61BCD, 0x6E31B11B), + MK_64(0x1A18EBEA, 0xD46A32E3), + MK_64(0xA2CC5B18, 0xCE84AA82), + MK_64(0x6982AB28, 0x9D46982D)}; /* blkSize = 512 bits. hashSize = 160 bits */ const u64b_t SKEIN_512_IV_160[] = { - MK_64(0x28B81A2A,0xE013BD91), - MK_64(0xC2F11668,0xB5BDF78F), - MK_64(0x1760D8F3,0xF6A56F12), - MK_64(0x4FB74758,0x8239904F), - MK_64(0x21EDE07F,0x7EAF5056), - MK_64(0xD908922E,0x63ED70B8), - MK_64(0xB8EC76FF,0xECCB52FA), - MK_64(0x01A47BB8,0xA3F27A6E) - }; + MK_64(0x28B81A2A, 0xE013BD91), + MK_64(0xC2F11668, 0xB5BDF78F), + MK_64(0x1760D8F3, 0xF6A56F12), + MK_64(0x4FB74758, 0x8239904F), + MK_64(0x21EDE07F, 0x7EAF5056), + MK_64(0xD908922E, 0x63ED70B8), + MK_64(0xB8EC76FF, 0xECCB52FA), + MK_64(0x01A47BB8, 0xA3F27A6E)}; /* blkSize = 512 bits. hashSize = 224 bits */ const u64b_t SKEIN_512_IV_224[] = { - MK_64(0xCCD06162,0x48677224), - MK_64(0xCBA65CF3,0xA92339EF), - MK_64(0x8CCD69D6,0x52FF4B64), - MK_64(0x398AED7B,0x3AB890B4), - MK_64(0x0F59D1B1,0x457D2BD0), - MK_64(0x6776FE65,0x75D4EB3D), - MK_64(0x99FBC70E,0x997413E9), - MK_64(0x9E2CFCCF,0xE1C41EF7) - }; + MK_64(0xCCD06162, 0x48677224), + MK_64(0xCBA65CF3, 0xA92339EF), + MK_64(0x8CCD69D6, 0x52FF4B64), + MK_64(0x398AED7B, 0x3AB890B4), + MK_64(0x0F59D1B1, 0x457D2BD0), + MK_64(0x6776FE65, 0x75D4EB3D), + MK_64(0x99FBC70E, 0x997413E9), + MK_64(0x9E2CFCCF, 0xE1C41EF7)}; /* blkSize = 512 bits. hashSize = 256 bits */ const u64b_t SKEIN_512_IV_256[] = { - MK_64(0xCCD044A1,0x2FDB3E13), - MK_64(0xE8359030,0x1A79A9EB), - MK_64(0x55AEA061,0x4F816E6F), - MK_64(0x2A2767A4,0xAE9B94DB), - MK_64(0xEC06025E,0x74DD7683), - MK_64(0xE7A436CD,0xC4746251), - MK_64(0xC36FBAF9,0x393AD185), - MK_64(0x3EEDBA18,0x33EDFC13) - }; + MK_64(0xCCD044A1, 0x2FDB3E13), + MK_64(0xE8359030, 0x1A79A9EB), + MK_64(0x55AEA061, 0x4F816E6F), + MK_64(0x2A2767A4, 0xAE9B94DB), + MK_64(0xEC06025E, 0x74DD7683), + MK_64(0xE7A436CD, 0xC4746251), + MK_64(0xC36FBAF9, 0x393AD185), + MK_64(0x3EEDBA18, 0x33EDFC13)}; /* blkSize = 512 bits. hashSize = 384 bits */ const u64b_t SKEIN_512_IV_384[] = { - MK_64(0xA3F6C6BF,0x3A75EF5F), - MK_64(0xB0FEF9CC,0xFD84FAA4), - MK_64(0x9D77DD66,0x3D770CFE), - MK_64(0xD798CBF3,0xB468FDDA), - MK_64(0x1BC4A666,0x8A0E4465), - MK_64(0x7ED7D434,0xE5807407), - MK_64(0x548FC1AC,0xD4EC44D6), - MK_64(0x266E1754,0x6AA18FF8) - }; + MK_64(0xA3F6C6BF, 0x3A75EF5F), + MK_64(0xB0FEF9CC, 0xFD84FAA4), + MK_64(0x9D77DD66, 0x3D770CFE), + MK_64(0xD798CBF3, 0xB468FDDA), + MK_64(0x1BC4A666, 0x8A0E4465), + MK_64(0x7ED7D434, 0xE5807407), + MK_64(0x548FC1AC, 0xD4EC44D6), + MK_64(0x266E1754, 0x6AA18FF8)}; /* blkSize = 512 bits. hashSize = 512 bits */ const u64b_t SKEIN_512_IV_512[] = { - MK_64(0x4903ADFF,0x749C51CE), - MK_64(0x0D95DE39,0x9746DF03), - MK_64(0x8FD19341,0x27C79BCE), - MK_64(0x9A255629,0xFF352CB1), - MK_64(0x5DB62599,0xDF6CA7B0), - MK_64(0xEABE394C,0xA9D5C3F4), - MK_64(0x991112C7,0x1A75B523), - MK_64(0xAE18A40B,0x660FCC33) - }; + MK_64(0x4903ADFF, 0x749C51CE), + MK_64(0x0D95DE39, 0x9746DF03), + MK_64(0x8FD19341, 0x27C79BCE), + MK_64(0x9A255629, 0xFF352CB1), + MK_64(0x5DB62599, 0xDF6CA7B0), + MK_64(0xEABE394C, 0xA9D5C3F4), + MK_64(0x991112C7, 0x1A75B523), + MK_64(0xAE18A40B, 0x660FCC33)}; /* blkSize = 1024 bits. hashSize = 384 bits */ const u64b_t SKEIN1024_IV_384[] = { - MK_64(0x5102B6B8,0xC1894A35), - MK_64(0xFEEBC9E3,0xFE8AF11A), - MK_64(0x0C807F06,0xE32BED71), - MK_64(0x60C13A52,0xB41A91F6), - MK_64(0x9716D35D,0xD4917C38), - MK_64(0xE780DF12,0x6FD31D3A), - MK_64(0x797846B6,0xC898303A), - MK_64(0xB172C2A8,0xB3572A3B), - MK_64(0xC9BC8203,0xA6104A6C), - MK_64(0x65909338,0xD75624F4), - MK_64(0x94BCC568,0x4B3F81A0), - MK_64(0x3EBBF51E,0x10ECFD46), - MK_64(0x2DF50F0B,0xEEB08542), - MK_64(0x3B5A6530,0x0DBC6516), - MK_64(0x484B9CD2,0x167BBCE1), - MK_64(0x2D136947,0xD4CBAFEA) - }; + MK_64(0x5102B6B8, 0xC1894A35), + MK_64(0xFEEBC9E3, 0xFE8AF11A), + MK_64(0x0C807F06, 0xE32BED71), + MK_64(0x60C13A52, 0xB41A91F6), + MK_64(0x9716D35D, 0xD4917C38), + MK_64(0xE780DF12, 0x6FD31D3A), + MK_64(0x797846B6, 0xC898303A), + MK_64(0xB172C2A8, 0xB3572A3B), + MK_64(0xC9BC8203, 0xA6104A6C), + MK_64(0x65909338, 0xD75624F4), + MK_64(0x94BCC568, 0x4B3F81A0), + MK_64(0x3EBBF51E, 0x10ECFD46), + MK_64(0x2DF50F0B, 0xEEB08542), + MK_64(0x3B5A6530, 0x0DBC6516), + MK_64(0x484B9CD2, 0x167BBCE1), + MK_64(0x2D136947, 0xD4CBAFEA)}; /* blkSize = 1024 bits. hashSize = 512 bits */ const u64b_t SKEIN1024_IV_512[] = { - MK_64(0xCAEC0E5D,0x7C1B1B18), - MK_64(0xA01B0E04,0x5F03E802), - MK_64(0x33840451,0xED912885), - MK_64(0x374AFB04,0xEAEC2E1C), - MK_64(0xDF25A0E2,0x813581F7), - MK_64(0xE4004093,0x8B12F9D2), - MK_64(0xA662D539,0xC2ED39B6), - MK_64(0xFA8B85CF,0x45D8C75A), - MK_64(0x8316ED8E,0x29EDE796), - MK_64(0x053289C0,0x2E9F91B8), - MK_64(0xC3F8EF1D,0x6D518B73), - MK_64(0xBDCEC3C4,0xD5EF332E), - MK_64(0x549A7E52,0x22974487), - MK_64(0x67070872,0x5B749816), - MK_64(0xB9CD28FB,0xF0581BD1), - MK_64(0x0E2940B8,0x15804974) - }; + MK_64(0xCAEC0E5D, 0x7C1B1B18), + MK_64(0xA01B0E04, 0x5F03E802), + MK_64(0x33840451, 0xED912885), + MK_64(0x374AFB04, 0xEAEC2E1C), + MK_64(0xDF25A0E2, 0x813581F7), + MK_64(0xE4004093, 0x8B12F9D2), + MK_64(0xA662D539, 0xC2ED39B6), + MK_64(0xFA8B85CF, 0x45D8C75A), + MK_64(0x8316ED8E, 0x29EDE796), + MK_64(0x053289C0, 0x2E9F91B8), + MK_64(0xC3F8EF1D, 0x6D518B73), + MK_64(0xBDCEC3C4, 0xD5EF332E), + MK_64(0x549A7E52, 0x22974487), + MK_64(0x67070872, 0x5B749816), + MK_64(0xB9CD28FB, 0xF0581BD1), + MK_64(0x0E2940B8, 0x15804974)}; /* blkSize = 1024 bits. hashSize = 1024 bits */ const u64b_t SKEIN1024_IV_1024[] = { - MK_64(0xD593DA07,0x41E72355), - MK_64(0x15B5E511,0xAC73E00C), - MK_64(0x5180E5AE,0xBAF2C4F0), - MK_64(0x03BD41D3,0xFCBCAFAF), - MK_64(0x1CAEC6FD,0x1983A898), - MK_64(0x6E510B8B,0xCDD0589F), - MK_64(0x77E2BDFD,0xC6394ADA), - MK_64(0xC11E1DB5,0x24DCB0A3), - MK_64(0xD6D14AF9,0xC6329AB5), - MK_64(0x6A9B0BFC,0x6EB67E0D), - MK_64(0x9243C60D,0xCCFF1332), - MK_64(0x1A1F1DDE,0x743F02D4), - MK_64(0x0996753C,0x10ED0BB8), - MK_64(0x6572DD22,0xF2B4969A), - MK_64(0x61FD3062,0xD00A579A), - MK_64(0x1DE0536E,0x8682E539) - }; - + MK_64(0xD593DA07, 0x41E72355), + MK_64(0x15B5E511, 0xAC73E00C), + MK_64(0x5180E5AE, 0xBAF2C4F0), + MK_64(0x03BD41D3, 0xFCBCAFAF), + MK_64(0x1CAEC6FD, 0x1983A898), + MK_64(0x6E510B8B, 0xCDD0589F), + MK_64(0x77E2BDFD, 0xC6394ADA), + MK_64(0xC11E1DB5, 0x24DCB0A3), + MK_64(0xD6D14AF9, 0xC6329AB5), + MK_64(0x6A9B0BFC, 0x6EB67E0D), + MK_64(0x9243C60D, 0xCCFF1332), + MK_64(0x1A1F1DDE, 0x743F02D4), + MK_64(0x0996753C, 0x10ED0BB8), + MK_64(0x6572DD22, 0xF2B4969A), + MK_64(0x61FD3062, 0xD00A579A), + MK_64(0x1DE0536E, 0x8682E539)}; #ifndef SKEIN_USE_ASM -#define SKEIN_USE_ASM (0) /* default is all C code (no ASM) */ +#define SKEIN_USE_ASM (0) /* default is all C code (no ASM) */ #endif #ifndef SKEIN_LOOP -#define SKEIN_LOOP 001 /* default: unroll 256 and 512, but not 1024 */ +#define SKEIN_LOOP 001 /* default: unroll 256 and 512, but not 1024 */ #endif -#define BLK_BITS (WCNT*64) /* some useful definitions for code here */ -#define KW_TWK_BASE (0) -#define KW_KEY_BASE (3) -#define ks (kw + KW_KEY_BASE) -#define ts (kw + KW_TWK_BASE) +#define BLK_BITS (WCNT * 64) /* some useful definitions for code here */ +#define KW_TWK_BASE (0) +#define KW_KEY_BASE (3) +#define ks (kw + KW_KEY_BASE) +#define ts (kw + KW_TWK_BASE) #ifdef SKEIN_DEBUG -#define DebugSaveTweak(ctx) { ctx->h.T[0] = ts[0]; ctx->h.T[1] = ts[1]; } +#define DebugSaveTweak(ctx) \ + { \ + ctx->h.T[0] = ts[0]; \ + ctx->h.T[1] = ts[1]; \ + } #else #define DebugSaveTweak(ctx) #endif /***************************** Skein_256 ******************************/ #if !(SKEIN_USE_ASM & 256) -static void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd) - { /* do it in C */ +static void Skein_256_Process_Block(Skein_256_Ctxt_t* ctx, const u08b_t* blkPtr, size_t blkCnt, size_t byteCntAdd) +{ /* do it in C */ enum - { + { WCNT = SKEIN_256_STATE_WORDS - }; -#undef RCNT -#define RCNT (SKEIN_256_ROUNDS_TOTAL/8) + }; +#undef RCNT +#define RCNT (SKEIN_256_ROUNDS_TOTAL / 8) -#ifdef SKEIN_LOOP /* configure how much to unroll the loop */ -#define SKEIN_UNROLL_256 (((SKEIN_LOOP)/100)%10) +#ifdef SKEIN_LOOP /* configure how much to unroll the loop */ +#define SKEIN_UNROLL_256 (((SKEIN_LOOP) / 100) % 10) #else #define SKEIN_UNROLL_256 (0) #endif #if SKEIN_UNROLL_256 -#if (RCNT % SKEIN_UNROLL_256) +#if(RCNT % SKEIN_UNROLL_256) #error "Invalid SKEIN_UNROLL_256" /* sanity check on unroll count */ #endif - size_t r; - u64b_t kw[WCNT+4+RCNT*2]; /* key schedule words : chaining vars + tweak + "rotation"*/ + size_t r; + u64b_t kw[WCNT + 4 + RCNT * 2]; /* key schedule words : chaining vars + tweak + "rotation"*/ #else - u64b_t kw[WCNT+4]; /* key schedule words : chaining vars + tweak */ + u64b_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */ #endif - u64b_t X0,X1,X2,X3; /* local copy of context vars, for speed */ - u64b_t w [WCNT]; /* local copy of input block */ + u64b_t X0, X1, X2, X3; /* local copy of context vars, for speed */ + u64b_t w[WCNT]; /* local copy of input block */ #ifdef SKEIN_DEBUG - const u64b_t *Xptr[4]; /* use for debugging (help compiler put Xn in registers) */ - Xptr[0] = &X0; Xptr[1] = &X1; Xptr[2] = &X2; Xptr[3] = &X3; + const u64b_t* Xptr[4]; /* use for debugging (help compiler put Xn in registers) */ + Xptr[0] = &X0; + Xptr[1] = &X1; + Xptr[2] = &X2; + Xptr[3] = &X3; #endif - Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */ + Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */ ts[0] = ctx->h.T[0]; ts[1] = ctx->h.T[1]; - do { + do + { /* this implementation only supports 2**64 input bytes (no carry out here) */ - ts[0] += byteCntAdd; /* update processed length */ + ts[0] += byteCntAdd; /* update processed length */ /* precompute the key schedule for this block */ ks[0] = ctx->X[0]; @@ -575,114 +674,118 @@ static void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,s ts[2] = ts[0] ^ ts[1]; - Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */ + Skein_Get64_LSB_First(w, blkPtr, WCNT); /* get input block in little-endian format */ DebugSaveTweak(ctx); - Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts); + Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts); - X0 = w[0] + ks[0]; /* do the first full key injection */ + X0 = w[0] + ks[0]; /* do the first full key injection */ X1 = w[1] + ks[1] + ts[0]; X2 = w[2] + ks[2] + ts[1]; X3 = w[3] + ks[3]; - Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr); /* show starting state values */ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL, Xptr); /* show starting state values */ blkPtr += SKEIN_256_BLOCK_BYTES; /* run the rounds */ -#define Round256(p0,p1,p2,p3,ROT,rNum) \ - X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \ - X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \ +#define Round256(p0, p1, p2, p3, ROT, rNum) \ + X##p0 += X##p1; \ + X##p1 = RotL_64(X##p1, ROT##_0); \ + X##p1 ^= X##p0; \ + X##p2 += X##p3; \ + X##p3 = RotL_64(X##p3, ROT##_1); \ + X##p3 ^= X##p2; #if SKEIN_UNROLL_256 == 0 -#define R256(p0,p1,p2,p3,ROT,rNum) /* fully unrolled */ \ - Round256(p0,p1,p2,p3,ROT,rNum) \ - Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rNum,Xptr); - -#define I256(R) \ - X0 += ks[((R)+1) % 5]; /* inject the key schedule value */ \ - X1 += ks[((R)+2) % 5] + ts[((R)+1) % 3]; \ - X2 += ks[((R)+3) % 5] + ts[((R)+2) % 3]; \ - X3 += ks[((R)+4) % 5] + (R)+1; \ - Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); -#else /* looping version */ -#define R256(p0,p1,p2,p3,ROT,rNum) \ - Round256(p0,p1,p2,p3,ROT,rNum) \ - Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rNum,Xptr); - -#define I256(R) \ - X0 += ks[r+(R)+0]; /* inject the key schedule value */ \ - X1 += ks[r+(R)+1] + ts[r+(R)+0]; \ - X2 += ks[r+(R)+2] + ts[r+(R)+1]; \ - X3 += ks[r+(R)+3] + r+(R) ; \ - ks[r + (R)+4 ] = ks[r+(R)-1]; /* rotate key schedule */\ - ts[r + (R)+2 ] = ts[r+(R)-1]; \ - Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); - - for (r=1;r < 2*RCNT;r+=2*SKEIN_UNROLL_256) /* loop thru it */ +#define R256(p0, p1, p2, p3, ROT, rNum) /* fully unrolled */ \ + Round256(p0, p1, p2, p3, ROT, rNum) \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr); + +#define I256(R) \ + X0 += ks[((R) + 1) % 5]; /* inject the key schedule value */ \ + X1 += ks[((R) + 2) % 5] + ts[((R) + 1) % 3]; \ + X2 += ks[((R) + 3) % 5] + ts[((R) + 2) % 3]; \ + X3 += ks[((R) + 4) % 5] + (R) + 1; \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); +#else /* looping version */ +#define R256(p0, p1, p2, p3, ROT, rNum) \ + Round256(p0, p1, p2, p3, ROT, rNum) \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr); + +#define I256(R) \ + X0 += ks[r + (R) + 0]; /* inject the key schedule value */ \ + X1 += ks[r + (R) + 1] + ts[r + (R) + 0]; \ + X2 += ks[r + (R) + 2] + ts[r + (R) + 1]; \ + X3 += ks[r + (R) + 3] + r + (R); \ + ks[r + (R) + 4] = ks[r + (R)-1]; /* rotate key schedule */ \ + ts[r + (R) + 2] = ts[r + (R)-1]; \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); + + for(r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_256) /* loop thru it */ #endif { -#define R256_8_rounds(R) \ - R256(0,1,2,3,R_256_0,8*(R) + 1); \ - R256(0,3,2,1,R_256_1,8*(R) + 2); \ - R256(0,1,2,3,R_256_2,8*(R) + 3); \ - R256(0,3,2,1,R_256_3,8*(R) + 4); \ - I256(2*(R)); \ - R256(0,1,2,3,R_256_4,8*(R) + 5); \ - R256(0,3,2,1,R_256_5,8*(R) + 6); \ - R256(0,1,2,3,R_256_6,8*(R) + 7); \ - R256(0,3,2,1,R_256_7,8*(R) + 8); \ - I256(2*(R)+1); - - R256_8_rounds( 0); - -#define R256_Unroll_R(NN) ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_256 > (NN))) - - #if R256_Unroll_R( 1) - R256_8_rounds( 1); - #endif - #if R256_Unroll_R( 2) - R256_8_rounds( 2); - #endif - #if R256_Unroll_R( 3) - R256_8_rounds( 3); - #endif - #if R256_Unroll_R( 4) - R256_8_rounds( 4); - #endif - #if R256_Unroll_R( 5) - R256_8_rounds( 5); - #endif - #if R256_Unroll_R( 6) - R256_8_rounds( 6); - #endif - #if R256_Unroll_R( 7) - R256_8_rounds( 7); - #endif - #if R256_Unroll_R( 8) - R256_8_rounds( 8); - #endif - #if R256_Unroll_R( 9) - R256_8_rounds( 9); - #endif - #if R256_Unroll_R(10) - R256_8_rounds(10); - #endif - #if R256_Unroll_R(11) - R256_8_rounds(11); - #endif - #if R256_Unroll_R(12) - R256_8_rounds(12); - #endif - #if R256_Unroll_R(13) - R256_8_rounds(13); - #endif - #if R256_Unroll_R(14) - R256_8_rounds(14); - #endif - #if (SKEIN_UNROLL_256 > 14) -#error "need more unrolling in Skein_256_Process_Block" - #endif +#define R256_8_rounds(R) \ + R256(0, 1, 2, 3, R_256_0, 8 * (R) + 1); \ + R256(0, 3, 2, 1, R_256_1, 8 * (R) + 2); \ + R256(0, 1, 2, 3, R_256_2, 8 * (R) + 3); \ + R256(0, 3, 2, 1, R_256_3, 8 * (R) + 4); \ + I256(2 * (R)); \ + R256(0, 1, 2, 3, R_256_4, 8 * (R) + 5); \ + R256(0, 3, 2, 1, R_256_5, 8 * (R) + 6); \ + R256(0, 1, 2, 3, R_256_6, 8 * (R) + 7); \ + R256(0, 3, 2, 1, R_256_7, 8 * (R) + 8); \ + I256(2 * (R) + 1); + + R256_8_rounds(0); + +#define R256_Unroll_R(NN) ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL / 8 > (NN)) || (SKEIN_UNROLL_256 > (NN))) + +#if R256_Unroll_R(1) + R256_8_rounds(1); +#endif +#if R256_Unroll_R(2) + R256_8_rounds(2); +#endif +#if R256_Unroll_R(3) + R256_8_rounds(3); +#endif +#if R256_Unroll_R(4) + R256_8_rounds(4); +#endif +#if R256_Unroll_R(5) + R256_8_rounds(5); +#endif +#if R256_Unroll_R(6) + R256_8_rounds(6); +#endif +#if R256_Unroll_R(7) + R256_8_rounds(7); +#endif +#if R256_Unroll_R(8) + R256_8_rounds(8); +#endif +#if R256_Unroll_R(9) + R256_8_rounds(9); +#endif +#if R256_Unroll_R(10) + R256_8_rounds(10); +#endif +#if R256_Unroll_R(11) + R256_8_rounds(11); +#endif +#if R256_Unroll_R(12) + R256_8_rounds(12); +#endif +#if R256_Unroll_R(13) + R256_8_rounds(13); +#endif +#if R256_Unroll_R(14) + R256_8_rounds(14); +#endif +#if(SKEIN_UNROLL_256 > 14) +#error "need more unrolling in Skein_256_Process_Block" +#endif } /* do the final "feedforward" xor, update context chaining vars */ ctx->X[0] = X0 ^ w[0]; @@ -690,68 +793,74 @@ static void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx,const u08b_t *blkPtr,s ctx->X[2] = X2 ^ w[2]; ctx->X[3] = X3 ^ w[3]; - Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X); + Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X); ts[1] &= ~SKEIN_T1_FLAG_FIRST; - } - while (--blkCnt); + } while(--blkCnt); ctx->h.T[0] = ts[0]; ctx->h.T[1] = ts[1]; - } +} #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) static size_t Skein_256_Process_Block_CodeSize(void) - { - return ((u08b_t *) Skein_256_Process_Block_CodeSize) - - ((u08b_t *) Skein_256_Process_Block); - } +{ + return ((u08b_t*)Skein_256_Process_Block_CodeSize) - + ((u08b_t*)Skein_256_Process_Block); +} static uint_t Skein_256_Unroll_Cnt(void) - { +{ return SKEIN_UNROLL_256; - } +} #endif #endif /***************************** Skein_512 ******************************/ #if !(SKEIN_USE_ASM & 512) -static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd) - { /* do it in C */ +static void Skein_512_Process_Block(Skein_512_Ctxt_t* ctx, const u08b_t* blkPtr, size_t blkCnt, size_t byteCntAdd) +{ /* do it in C */ enum - { + { WCNT = SKEIN_512_STATE_WORDS - }; -#undef RCNT -#define RCNT (SKEIN_512_ROUNDS_TOTAL/8) + }; +#undef RCNT +#define RCNT (SKEIN_512_ROUNDS_TOTAL / 8) -#ifdef SKEIN_LOOP /* configure how much to unroll the loop */ -#define SKEIN_UNROLL_512 (((SKEIN_LOOP)/10)%10) +#ifdef SKEIN_LOOP /* configure how much to unroll the loop */ +#define SKEIN_UNROLL_512 (((SKEIN_LOOP) / 10) % 10) #else #define SKEIN_UNROLL_512 (0) #endif #if SKEIN_UNROLL_512 -#if (RCNT % SKEIN_UNROLL_512) +#if(RCNT % SKEIN_UNROLL_512) #error "Invalid SKEIN_UNROLL_512" /* sanity check on unroll count */ #endif - size_t r; - u64b_t kw[WCNT+4+RCNT*2]; /* key schedule words : chaining vars + tweak + "rotation"*/ + size_t r; + u64b_t kw[WCNT + 4 + RCNT * 2]; /* key schedule words : chaining vars + tweak + "rotation"*/ #else - u64b_t kw[WCNT+4]; /* key schedule words : chaining vars + tweak */ + u64b_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */ #endif - u64b_t X0,X1,X2,X3,X4,X5,X6,X7; /* local copy of vars, for speed */ - u64b_t w [WCNT]; /* local copy of input block */ + u64b_t X0, X1, X2, X3, X4, X5, X6, X7; /* local copy of vars, for speed */ + u64b_t w[WCNT]; /* local copy of input block */ #ifdef SKEIN_DEBUG - const u64b_t *Xptr[8]; /* use for debugging (help compiler put Xn in registers) */ - Xptr[0] = &X0; Xptr[1] = &X1; Xptr[2] = &X2; Xptr[3] = &X3; - Xptr[4] = &X4; Xptr[5] = &X5; Xptr[6] = &X6; Xptr[7] = &X7; + const u64b_t* Xptr[8]; /* use for debugging (help compiler put Xn in registers) */ + Xptr[0] = &X0; + Xptr[1] = &X1; + Xptr[2] = &X2; + Xptr[3] = &X3; + Xptr[4] = &X4; + Xptr[5] = &X5; + Xptr[6] = &X6; + Xptr[7] = &X7; #endif - Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */ + Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */ ts[0] = ctx->h.T[0]; ts[1] = ctx->h.T[1]; - do { + do + { /* this implementation only supports 2**64 input bytes (no carry out here) */ - ts[0] += byteCntAdd; /* update processed length */ + ts[0] += byteCntAdd; /* update processed length */ /* precompute the key schedule for this block */ ks[0] = ctx->X[0]; @@ -767,126 +876,134 @@ static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,s ts[2] = ts[0] ^ ts[1]; - Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */ + Skein_Get64_LSB_First(w, blkPtr, WCNT); /* get input block in little-endian format */ DebugSaveTweak(ctx); - Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts); + Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts); - X0 = w[0] + ks[0]; /* do the first full key injection */ - X1 = w[1] + ks[1]; - X2 = w[2] + ks[2]; - X3 = w[3] + ks[3]; - X4 = w[4] + ks[4]; - X5 = w[5] + ks[5] + ts[0]; - X6 = w[6] + ks[6] + ts[1]; - X7 = w[7] + ks[7]; + X0 = w[0] + ks[0]; /* do the first full key injection */ + X1 = w[1] + ks[1]; + X2 = w[2] + ks[2]; + X3 = w[3] + ks[3]; + X4 = w[4] + ks[4]; + X5 = w[5] + ks[5] + ts[0]; + X6 = w[6] + ks[6] + ts[1]; + X7 = w[7] + ks[7]; blkPtr += SKEIN_512_BLOCK_BYTES; - Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr); + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL, Xptr); /* run the rounds */ -#define Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \ - X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \ - X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \ - X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4; \ - X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6; \ +#define Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \ + X##p0 += X##p1; \ + X##p1 = RotL_64(X##p1, ROT##_0); \ + X##p1 ^= X##p0; \ + X##p2 += X##p3; \ + X##p3 = RotL_64(X##p3, ROT##_1); \ + X##p3 ^= X##p2; \ + X##p4 += X##p5; \ + X##p5 = RotL_64(X##p5, ROT##_2); \ + X##p5 ^= X##p4; \ + X##p6 += X##p7; \ + X##p7 = RotL_64(X##p7, ROT##_3); \ + X##p7 ^= X##p6; #if SKEIN_UNROLL_512 == 0 -#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) /* unrolled */ \ - Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \ - Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rNum,Xptr); - -#define I512(R) \ - X0 += ks[((R)+1) % 9]; /* inject the key schedule value */ \ - X1 += ks[((R)+2) % 9]; \ - X2 += ks[((R)+3) % 9]; \ - X3 += ks[((R)+4) % 9]; \ - X4 += ks[((R)+5) % 9]; \ - X5 += ks[((R)+6) % 9] + ts[((R)+1) % 3]; \ - X6 += ks[((R)+7) % 9] + ts[((R)+2) % 3]; \ - X7 += ks[((R)+8) % 9] + (R)+1; \ - Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); -#else /* looping version */ -#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \ - Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \ - Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rNum,Xptr); - -#define I512(R) \ - X0 += ks[r+(R)+0]; /* inject the key schedule value */ \ - X1 += ks[r+(R)+1]; \ - X2 += ks[r+(R)+2]; \ - X3 += ks[r+(R)+3]; \ - X4 += ks[r+(R)+4]; \ - X5 += ks[r+(R)+5] + ts[r+(R)+0]; \ - X6 += ks[r+(R)+6] + ts[r+(R)+1]; \ - X7 += ks[r+(R)+7] + r+(R) ; \ - ks[r + (R)+8] = ks[r+(R)-1]; /* rotate key schedule */ \ - ts[r + (R)+2] = ts[r+(R)-1]; \ - Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); - - for (r=1;r < 2*RCNT;r+=2*SKEIN_UNROLL_512) /* loop thru it */ -#endif /* end of looped code definitions */ +#define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) /* unrolled */ \ + Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr); + +#define I512(R) \ + X0 += ks[((R) + 1) % 9]; /* inject the key schedule value */ \ + X1 += ks[((R) + 2) % 9]; \ + X2 += ks[((R) + 3) % 9]; \ + X3 += ks[((R) + 4) % 9]; \ + X4 += ks[((R) + 5) % 9]; \ + X5 += ks[((R) + 6) % 9] + ts[((R) + 1) % 3]; \ + X6 += ks[((R) + 7) % 9] + ts[((R) + 2) % 3]; \ + X7 += ks[((R) + 8) % 9] + (R) + 1; \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); +#else /* looping version */ +#define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \ + Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr); + +#define I512(R) \ + X0 += ks[r + (R) + 0]; /* inject the key schedule value */ \ + X1 += ks[r + (R) + 1]; \ + X2 += ks[r + (R) + 2]; \ + X3 += ks[r + (R) + 3]; \ + X4 += ks[r + (R) + 4]; \ + X5 += ks[r + (R) + 5] + ts[r + (R) + 0]; \ + X6 += ks[r + (R) + 6] + ts[r + (R) + 1]; \ + X7 += ks[r + (R) + 7] + r + (R); \ + ks[r + (R) + 8] = ks[r + (R)-1]; /* rotate key schedule */ \ + ts[r + (R) + 2] = ts[r + (R)-1]; \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); + + for(r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_512) /* loop thru it */ +#endif /* end of looped code definitions */ { -#define R512_8_rounds(R) /* do 8 full rounds */ \ - R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1); \ - R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2); \ - R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3); \ - R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4); \ - I512(2*(R)); \ - R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5); \ - R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6); \ - R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7); \ - R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8); \ - I512(2*(R)+1); /* and key injection */ - - R512_8_rounds( 0); - -#define R512_Unroll_R(NN) ((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_512 > (NN))) - - #if R512_Unroll_R( 1) - R512_8_rounds( 1); - #endif - #if R512_Unroll_R( 2) - R512_8_rounds( 2); - #endif - #if R512_Unroll_R( 3) - R512_8_rounds( 3); - #endif - #if R512_Unroll_R( 4) - R512_8_rounds( 4); - #endif - #if R512_Unroll_R( 5) - R512_8_rounds( 5); - #endif - #if R512_Unroll_R( 6) - R512_8_rounds( 6); - #endif - #if R512_Unroll_R( 7) - R512_8_rounds( 7); - #endif - #if R512_Unroll_R( 8) - R512_8_rounds( 8); - #endif - #if R512_Unroll_R( 9) - R512_8_rounds( 9); - #endif - #if R512_Unroll_R(10) - R512_8_rounds(10); - #endif - #if R512_Unroll_R(11) - R512_8_rounds(11); - #endif - #if R512_Unroll_R(12) - R512_8_rounds(12); - #endif - #if R512_Unroll_R(13) - R512_8_rounds(13); - #endif - #if R512_Unroll_R(14) - R512_8_rounds(14); - #endif - #if (SKEIN_UNROLL_512 > 14) -#error "need more unrolling in Skein_512_Process_Block" - #endif +#define R512_8_rounds(R) /* do 8 full rounds */ \ + R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_0, 8 * (R) + 1); \ + R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_1, 8 * (R) + 2); \ + R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_2, 8 * (R) + 3); \ + R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_3, 8 * (R) + 4); \ + I512(2 * (R)); \ + R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_4, 8 * (R) + 5); \ + R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_5, 8 * (R) + 6); \ + R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_6, 8 * (R) + 7); \ + R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_7, 8 * (R) + 8); \ + I512(2 * (R) + 1); /* and key injection */ + + R512_8_rounds(0); + +#define R512_Unroll_R(NN) ((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL / 8 > (NN)) || (SKEIN_UNROLL_512 > (NN))) + +#if R512_Unroll_R(1) + R512_8_rounds(1); +#endif +#if R512_Unroll_R(2) + R512_8_rounds(2); +#endif +#if R512_Unroll_R(3) + R512_8_rounds(3); +#endif +#if R512_Unroll_R(4) + R512_8_rounds(4); +#endif +#if R512_Unroll_R(5) + R512_8_rounds(5); +#endif +#if R512_Unroll_R(6) + R512_8_rounds(6); +#endif +#if R512_Unroll_R(7) + R512_8_rounds(7); +#endif +#if R512_Unroll_R(8) + R512_8_rounds(8); +#endif +#if R512_Unroll_R(9) + R512_8_rounds(9); +#endif +#if R512_Unroll_R(10) + R512_8_rounds(10); +#endif +#if R512_Unroll_R(11) + R512_8_rounds(11); +#endif +#if R512_Unroll_R(12) + R512_8_rounds(12); +#endif +#if R512_Unroll_R(13) + R512_8_rounds(13); +#endif +#if R512_Unroll_R(14) + R512_8_rounds(14); +#endif +#if(SKEIN_UNROLL_512 > 14) +#error "need more unrolling in Skein_512_Process_Block" +#endif } /* do the final "feedforward" xor, update context chaining vars */ @@ -898,256 +1015,284 @@ static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,s ctx->X[5] = X5 ^ w[5]; ctx->X[6] = X6 ^ w[6]; ctx->X[7] = X7 ^ w[7]; - Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X); + Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X); ts[1] &= ~SKEIN_T1_FLAG_FIRST; - } - while (--blkCnt); + } while(--blkCnt); ctx->h.T[0] = ts[0]; ctx->h.T[1] = ts[1]; - } +} #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) static size_t Skein_512_Process_Block_CodeSize(void) - { - return ((u08b_t *) Skein_512_Process_Block_CodeSize) - - ((u08b_t *) Skein_512_Process_Block); - } +{ + return ((u08b_t*)Skein_512_Process_Block_CodeSize) - + ((u08b_t*)Skein_512_Process_Block); +} static uint_t Skein_512_Unroll_Cnt(void) - { +{ return SKEIN_UNROLL_512; - } +} #endif #endif /***************************** Skein1024 ******************************/ #if !(SKEIN_USE_ASM & 1024) -static void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd) - { /* do it in C, always looping (unrolled is bigger AND slower!) */ +static void Skein1024_Process_Block(Skein1024_Ctxt_t* ctx, const u08b_t* blkPtr, size_t blkCnt, size_t byteCntAdd) +{ /* do it in C, always looping (unrolled is bigger AND slower!) */ enum - { + { WCNT = SKEIN1024_STATE_WORDS - }; -#undef RCNT -#define RCNT (SKEIN1024_ROUNDS_TOTAL/8) + }; +#undef RCNT +#define RCNT (SKEIN1024_ROUNDS_TOTAL / 8) -#ifdef SKEIN_LOOP /* configure how much to unroll the loop */ -#define SKEIN_UNROLL_1024 ((SKEIN_LOOP)%10) +#ifdef SKEIN_LOOP /* configure how much to unroll the loop */ +#define SKEIN_UNROLL_1024 ((SKEIN_LOOP) % 10) #else #define SKEIN_UNROLL_1024 (0) #endif -#if (SKEIN_UNROLL_1024 != 0) -#if (RCNT % SKEIN_UNROLL_1024) +#if(SKEIN_UNROLL_1024 != 0) +#if(RCNT % SKEIN_UNROLL_1024) #error "Invalid SKEIN_UNROLL_1024" /* sanity check on unroll count */ #endif - size_t r; - u64b_t kw[WCNT+4+RCNT*2]; /* key schedule words : chaining vars + tweak + "rotation"*/ + size_t r; + u64b_t kw[WCNT + 4 + RCNT * 2]; /* key schedule words : chaining vars + tweak + "rotation"*/ #else - u64b_t kw[WCNT+4]; /* key schedule words : chaining vars + tweak */ + u64b_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */ #endif - u64b_t X00,X01,X02,X03,X04,X05,X06,X07, /* local copy of vars, for speed */ - X08,X09,X10,X11,X12,X13,X14,X15; - u64b_t w [WCNT]; /* local copy of input block */ + u64b_t X00, X01, X02, X03, X04, X05, X06, X07, /* local copy of vars, for speed */ + X08, X09, X10, X11, X12, X13, X14, X15; + u64b_t w[WCNT]; /* local copy of input block */ #ifdef SKEIN_DEBUG - const u64b_t *Xptr[16]; /* use for debugging (help compiler put Xn in registers) */ - Xptr[ 0] = &X00; Xptr[ 1] = &X01; Xptr[ 2] = &X02; Xptr[ 3] = &X03; - Xptr[ 4] = &X04; Xptr[ 5] = &X05; Xptr[ 6] = &X06; Xptr[ 7] = &X07; - Xptr[ 8] = &X08; Xptr[ 9] = &X09; Xptr[10] = &X10; Xptr[11] = &X11; - Xptr[12] = &X12; Xptr[13] = &X13; Xptr[14] = &X14; Xptr[15] = &X15; + const u64b_t* Xptr[16]; /* use for debugging (help compiler put Xn in registers) */ + Xptr[0] = &X00; + Xptr[1] = &X01; + Xptr[2] = &X02; + Xptr[3] = &X03; + Xptr[4] = &X04; + Xptr[5] = &X05; + Xptr[6] = &X06; + Xptr[7] = &X07; + Xptr[8] = &X08; + Xptr[9] = &X09; + Xptr[10] = &X10; + Xptr[11] = &X11; + Xptr[12] = &X12; + Xptr[13] = &X13; + Xptr[14] = &X14; + Xptr[15] = &X15; #endif - Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */ + Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */ ts[0] = ctx->h.T[0]; ts[1] = ctx->h.T[1]; - do { + do + { /* this implementation only supports 2**64 input bytes (no carry out here) */ - ts[0] += byteCntAdd; /* update processed length */ + ts[0] += byteCntAdd; /* update processed length */ /* precompute the key schedule for this block */ - ks[ 0] = ctx->X[ 0]; - ks[ 1] = ctx->X[ 1]; - ks[ 2] = ctx->X[ 2]; - ks[ 3] = ctx->X[ 3]; - ks[ 4] = ctx->X[ 4]; - ks[ 5] = ctx->X[ 5]; - ks[ 6] = ctx->X[ 6]; - ks[ 7] = ctx->X[ 7]; - ks[ 8] = ctx->X[ 8]; - ks[ 9] = ctx->X[ 9]; + ks[0] = ctx->X[0]; + ks[1] = ctx->X[1]; + ks[2] = ctx->X[2]; + ks[3] = ctx->X[3]; + ks[4] = ctx->X[4]; + ks[5] = ctx->X[5]; + ks[6] = ctx->X[6]; + ks[7] = ctx->X[7]; + ks[8] = ctx->X[8]; + ks[9] = ctx->X[9]; ks[10] = ctx->X[10]; ks[11] = ctx->X[11]; ks[12] = ctx->X[12]; ks[13] = ctx->X[13]; ks[14] = ctx->X[14]; ks[15] = ctx->X[15]; - ks[16] = ks[ 0] ^ ks[ 1] ^ ks[ 2] ^ ks[ 3] ^ - ks[ 4] ^ ks[ 5] ^ ks[ 6] ^ ks[ 7] ^ - ks[ 8] ^ ks[ 9] ^ ks[10] ^ ks[11] ^ + ks[16] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ + ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ + ks[8] ^ ks[9] ^ ks[10] ^ ks[11] ^ ks[12] ^ ks[13] ^ ks[14] ^ ks[15] ^ SKEIN_KS_PARITY; - ts[2] = ts[0] ^ ts[1]; + ts[2] = ts[0] ^ ts[1]; - Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */ + Skein_Get64_LSB_First(w, blkPtr, WCNT); /* get input block in little-endian format */ DebugSaveTweak(ctx); - Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts); - - X00 = w[ 0] + ks[ 0]; /* do the first full key injection */ - X01 = w[ 1] + ks[ 1]; - X02 = w[ 2] + ks[ 2]; - X03 = w[ 3] + ks[ 3]; - X04 = w[ 4] + ks[ 4]; - X05 = w[ 5] + ks[ 5]; - X06 = w[ 6] + ks[ 6]; - X07 = w[ 7] + ks[ 7]; - X08 = w[ 8] + ks[ 8]; - X09 = w[ 9] + ks[ 9]; - X10 = w[10] + ks[10]; - X11 = w[11] + ks[11]; - X12 = w[12] + ks[12]; - X13 = w[13] + ks[13] + ts[0]; - X14 = w[14] + ks[14] + ts[1]; - X15 = w[15] + ks[15]; - - Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr); - -#define Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rNum) \ - X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \ - X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \ - X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4; \ - X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6; \ - X##p8 += X##p9; X##p9 = RotL_64(X##p9,ROT##_4); X##p9 ^= X##p8; \ - X##pA += X##pB; X##pB = RotL_64(X##pB,ROT##_5); X##pB ^= X##pA; \ - X##pC += X##pD; X##pD = RotL_64(X##pD,ROT##_6); X##pD ^= X##pC; \ - X##pE += X##pF; X##pF = RotL_64(X##pF,ROT##_7); X##pF ^= X##pE; \ + Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts); + + X00 = w[0] + ks[0]; /* do the first full key injection */ + X01 = w[1] + ks[1]; + X02 = w[2] + ks[2]; + X03 = w[3] + ks[3]; + X04 = w[4] + ks[4]; + X05 = w[5] + ks[5]; + X06 = w[6] + ks[6]; + X07 = w[7] + ks[7]; + X08 = w[8] + ks[8]; + X09 = w[9] + ks[9]; + X10 = w[10] + ks[10]; + X11 = w[11] + ks[11]; + X12 = w[12] + ks[12]; + X13 = w[13] + ks[13] + ts[0]; + X14 = w[14] + ks[14] + ts[1]; + X15 = w[15] + ks[15]; + + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL, Xptr); + +#define Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, ROT, rNum) \ + X##p0 += X##p1; \ + X##p1 = RotL_64(X##p1, ROT##_0); \ + X##p1 ^= X##p0; \ + X##p2 += X##p3; \ + X##p3 = RotL_64(X##p3, ROT##_1); \ + X##p3 ^= X##p2; \ + X##p4 += X##p5; \ + X##p5 = RotL_64(X##p5, ROT##_2); \ + X##p5 ^= X##p4; \ + X##p6 += X##p7; \ + X##p7 = RotL_64(X##p7, ROT##_3); \ + X##p7 ^= X##p6; \ + X##p8 += X##p9; \ + X##p9 = RotL_64(X##p9, ROT##_4); \ + X##p9 ^= X##p8; \ + X##pA += X##pB; \ + X##pB = RotL_64(X##pB, ROT##_5); \ + X##pB ^= X##pA; \ + X##pC += X##pD; \ + X##pD = RotL_64(X##pD, ROT##_6); \ + X##pD ^= X##pC; \ + X##pE += X##pF; \ + X##pF = RotL_64(X##pF, ROT##_7); \ + X##pF ^= X##pE; #if SKEIN_UNROLL_1024 == 0 -#define R1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \ - Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \ - Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rn,Xptr); - -#define I1024(R) \ - X00 += ks[((R)+ 1) % 17]; /* inject the key schedule value */ \ - X01 += ks[((R)+ 2) % 17]; \ - X02 += ks[((R)+ 3) % 17]; \ - X03 += ks[((R)+ 4) % 17]; \ - X04 += ks[((R)+ 5) % 17]; \ - X05 += ks[((R)+ 6) % 17]; \ - X06 += ks[((R)+ 7) % 17]; \ - X07 += ks[((R)+ 8) % 17]; \ - X08 += ks[((R)+ 9) % 17]; \ - X09 += ks[((R)+10) % 17]; \ - X10 += ks[((R)+11) % 17]; \ - X11 += ks[((R)+12) % 17]; \ - X12 += ks[((R)+13) % 17]; \ - X13 += ks[((R)+14) % 17] + ts[((R)+1) % 3]; \ - X14 += ks[((R)+15) % 17] + ts[((R)+2) % 3]; \ - X15 += ks[((R)+16) % 17] + (R)+1; \ - Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); -#else /* looping version */ -#define R1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \ - Round1024(p0,p1,p2,p3,p4,p5,p6,p7,p8,p9,pA,pB,pC,pD,pE,pF,ROT,rn) \ - Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rn,Xptr); - -#define I1024(R) \ - X00 += ks[r+(R)+ 0]; /* inject the key schedule value */ \ - X01 += ks[r+(R)+ 1]; \ - X02 += ks[r+(R)+ 2]; \ - X03 += ks[r+(R)+ 3]; \ - X04 += ks[r+(R)+ 4]; \ - X05 += ks[r+(R)+ 5]; \ - X06 += ks[r+(R)+ 6]; \ - X07 += ks[r+(R)+ 7]; \ - X08 += ks[r+(R)+ 8]; \ - X09 += ks[r+(R)+ 9]; \ - X10 += ks[r+(R)+10]; \ - X11 += ks[r+(R)+11]; \ - X12 += ks[r+(R)+12]; \ - X13 += ks[r+(R)+13] + ts[r+(R)+0]; \ - X14 += ks[r+(R)+14] + ts[r+(R)+1]; \ - X15 += ks[r+(R)+15] + r+(R) ; \ - ks[r + (R)+16] = ks[r+(R)-1]; /* rotate key schedule */ \ - ts[r + (R)+ 2] = ts[r+(R)-1]; \ - Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr); - - for (r=1;r <= 2*RCNT;r+=2*SKEIN_UNROLL_1024) /* loop thru it */ +#define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, ROT, rn) \ + Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, ROT, rn) \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rn, Xptr); + +#define I1024(R) \ + X00 += ks[((R) + 1) % 17]; /* inject the key schedule value */ \ + X01 += ks[((R) + 2) % 17]; \ + X02 += ks[((R) + 3) % 17]; \ + X03 += ks[((R) + 4) % 17]; \ + X04 += ks[((R) + 5) % 17]; \ + X05 += ks[((R) + 6) % 17]; \ + X06 += ks[((R) + 7) % 17]; \ + X07 += ks[((R) + 8) % 17]; \ + X08 += ks[((R) + 9) % 17]; \ + X09 += ks[((R) + 10) % 17]; \ + X10 += ks[((R) + 11) % 17]; \ + X11 += ks[((R) + 12) % 17]; \ + X12 += ks[((R) + 13) % 17]; \ + X13 += ks[((R) + 14) % 17] + ts[((R) + 1) % 3]; \ + X14 += ks[((R) + 15) % 17] + ts[((R) + 2) % 3]; \ + X15 += ks[((R) + 16) % 17] + (R) + 1; \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); +#else /* looping version */ +#define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, ROT, rn) \ + Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, ROT, rn) \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rn, Xptr); + +#define I1024(R) \ + X00 += ks[r + (R) + 0]; /* inject the key schedule value */ \ + X01 += ks[r + (R) + 1]; \ + X02 += ks[r + (R) + 2]; \ + X03 += ks[r + (R) + 3]; \ + X04 += ks[r + (R) + 4]; \ + X05 += ks[r + (R) + 5]; \ + X06 += ks[r + (R) + 6]; \ + X07 += ks[r + (R) + 7]; \ + X08 += ks[r + (R) + 8]; \ + X09 += ks[r + (R) + 9]; \ + X10 += ks[r + (R) + 10]; \ + X11 += ks[r + (R) + 11]; \ + X12 += ks[r + (R) + 12]; \ + X13 += ks[r + (R) + 13] + ts[r + (R) + 0]; \ + X14 += ks[r + (R) + 14] + ts[r + (R) + 1]; \ + X15 += ks[r + (R) + 15] + r + (R); \ + ks[r + (R) + 16] = ks[r + (R)-1]; /* rotate key schedule */ \ + ts[r + (R) + 2] = ts[r + (R)-1]; \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); + + for(r = 1; r <= 2 * RCNT; r += 2 * SKEIN_UNROLL_1024) /* loop thru it */ #endif { -#define R1024_8_rounds(R) /* do 8 full rounds */ \ - R1024(00,01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,R1024_0,8*(R) + 1); \ - R1024(00,09,02,13,06,11,04,15,10,07,12,03,14,05,08,01,R1024_1,8*(R) + 2); \ - R1024(00,07,02,05,04,03,06,01,12,15,14,13,08,11,10,09,R1024_2,8*(R) + 3); \ - R1024(00,15,02,11,06,13,04,09,14,01,08,05,10,03,12,07,R1024_3,8*(R) + 4); \ - I1024(2*(R)); \ - R1024(00,01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,R1024_4,8*(R) + 5); \ - R1024(00,09,02,13,06,11,04,15,10,07,12,03,14,05,08,01,R1024_5,8*(R) + 6); \ - R1024(00,07,02,05,04,03,06,01,12,15,14,13,08,11,10,09,R1024_6,8*(R) + 7); \ - R1024(00,15,02,11,06,13,04,09,14,01,08,05,10,03,12,07,R1024_7,8*(R) + 8); \ - I1024(2*(R)+1); - - R1024_8_rounds( 0); - -#define R1024_Unroll_R(NN) ((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_1024 > (NN))) - - #if R1024_Unroll_R( 1) - R1024_8_rounds( 1); - #endif - #if R1024_Unroll_R( 2) - R1024_8_rounds( 2); - #endif - #if R1024_Unroll_R( 3) - R1024_8_rounds( 3); - #endif - #if R1024_Unroll_R( 4) - R1024_8_rounds( 4); - #endif - #if R1024_Unroll_R( 5) - R1024_8_rounds( 5); - #endif - #if R1024_Unroll_R( 6) - R1024_8_rounds( 6); - #endif - #if R1024_Unroll_R( 7) - R1024_8_rounds( 7); - #endif - #if R1024_Unroll_R( 8) - R1024_8_rounds( 8); - #endif - #if R1024_Unroll_R( 9) - R1024_8_rounds( 9); - #endif - #if R1024_Unroll_R(10) - R1024_8_rounds(10); - #endif - #if R1024_Unroll_R(11) - R1024_8_rounds(11); - #endif - #if R1024_Unroll_R(12) - R1024_8_rounds(12); - #endif - #if R1024_Unroll_R(13) - R1024_8_rounds(13); - #endif - #if R1024_Unroll_R(14) - R1024_8_rounds(14); - #endif - #if (SKEIN_UNROLL_1024 > 14) -#error "need more unrolling in Skein_1024_Process_Block" - #endif +#define R1024_8_rounds(R) /* do 8 full rounds */ \ + R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, 14, 15, R1024_0, 8 * (R) + 1); \ + R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, 08, 01, R1024_1, 8 * (R) + 2); \ + R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, 10, 09, R1024_2, 8 * (R) + 3); \ + R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, 12, 07, R1024_3, 8 * (R) + 4); \ + I1024(2 * (R)); \ + R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, 14, 15, R1024_4, 8 * (R) + 5); \ + R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, 08, 01, R1024_5, 8 * (R) + 6); \ + R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, 10, 09, R1024_6, 8 * (R) + 7); \ + R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, 12, 07, R1024_7, 8 * (R) + 8); \ + I1024(2 * (R) + 1); + + R1024_8_rounds(0); + +#define R1024_Unroll_R(NN) ((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL / 8 > (NN)) || (SKEIN_UNROLL_1024 > (NN))) + +#if R1024_Unroll_R(1) + R1024_8_rounds(1); +#endif +#if R1024_Unroll_R(2) + R1024_8_rounds(2); +#endif +#if R1024_Unroll_R(3) + R1024_8_rounds(3); +#endif +#if R1024_Unroll_R(4) + R1024_8_rounds(4); +#endif +#if R1024_Unroll_R(5) + R1024_8_rounds(5); +#endif +#if R1024_Unroll_R(6) + R1024_8_rounds(6); +#endif +#if R1024_Unroll_R(7) + R1024_8_rounds(7); +#endif +#if R1024_Unroll_R(8) + R1024_8_rounds(8); +#endif +#if R1024_Unroll_R(9) + R1024_8_rounds(9); +#endif +#if R1024_Unroll_R(10) + R1024_8_rounds(10); +#endif +#if R1024_Unroll_R(11) + R1024_8_rounds(11); +#endif +#if R1024_Unroll_R(12) + R1024_8_rounds(12); +#endif +#if R1024_Unroll_R(13) + R1024_8_rounds(13); +#endif +#if R1024_Unroll_R(14) + R1024_8_rounds(14); +#endif +#if(SKEIN_UNROLL_1024 > 14) +#error "need more unrolling in Skein_1024_Process_Block" +#endif } /* do the final "feedforward" xor, update context chaining vars */ - ctx->X[ 0] = X00 ^ w[ 0]; - ctx->X[ 1] = X01 ^ w[ 1]; - ctx->X[ 2] = X02 ^ w[ 2]; - ctx->X[ 3] = X03 ^ w[ 3]; - ctx->X[ 4] = X04 ^ w[ 4]; - ctx->X[ 5] = X05 ^ w[ 5]; - ctx->X[ 6] = X06 ^ w[ 6]; - ctx->X[ 7] = X07 ^ w[ 7]; - ctx->X[ 8] = X08 ^ w[ 8]; - ctx->X[ 9] = X09 ^ w[ 9]; + ctx->X[0] = X00 ^ w[0]; + ctx->X[1] = X01 ^ w[1]; + ctx->X[2] = X02 ^ w[2]; + ctx->X[3] = X03 ^ w[3]; + ctx->X[4] = X04 ^ w[4]; + ctx->X[5] = X05 ^ w[5]; + ctx->X[6] = X06 ^ w[6]; + ctx->X[7] = X07 ^ w[7]; + ctx->X[8] = X08 ^ w[8]; + ctx->X[9] = X09 ^ w[9]; ctx->X[10] = X10 ^ w[10]; ctx->X[11] = X11 ^ w[11]; ctx->X[12] = X12 ^ w[12]; @@ -1155,30 +1300,28 @@ static void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx,const u08b_t *blkPtr,s ctx->X[14] = X14 ^ w[14]; ctx->X[15] = X15 ^ w[15]; - Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X); + Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X); ts[1] &= ~SKEIN_T1_FLAG_FIRST; blkPtr += SKEIN1024_BLOCK_BYTES; - } - while (--blkCnt); + } while(--blkCnt); ctx->h.T[0] = ts[0]; ctx->h.T[1] = ts[1]; - } +} #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) static size_t Skein1024_Process_Block_CodeSize(void) - { - return ((u08b_t *) Skein1024_Process_Block_CodeSize) - - ((u08b_t *) Skein1024_Process_Block); - } +{ + return ((u08b_t*)Skein1024_Process_Block_CodeSize) - + ((u08b_t*)Skein1024_Process_Block); +} static uint_t Skein1024_Unroll_Cnt(void) - { +{ return SKEIN_UNROLL_1024; - } +} #endif #endif - #if 0 /*****************************************************************/ /* 256-bit Skein */ @@ -1289,93 +1432,93 @@ static int Skein_256_InitExt(Skein_256_Ctxt_t *ctx,size_t hashBitLen,u64b_t tree /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ /* process the input bytes */ -static int Skein_256_Update(Skein_256_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt) - { +static int Skein_256_Update(Skein_256_Ctxt_t* ctx, const u08b_t* msg, size_t msgByteCnt) +{ size_t n; - Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL); /* catch uninitialized context */ /* process full blocks, if any */ - if (msgByteCnt + ctx->h.bCnt > SKEIN_256_BLOCK_BYTES) + if(msgByteCnt + ctx->h.bCnt > SKEIN_256_BLOCK_BYTES) + { + if(ctx->h.bCnt) /* finish up any buffered message data */ { - if (ctx->h.bCnt) /* finish up any buffered message data */ + n = SKEIN_256_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */ + if(n) { - n = SKEIN_256_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */ - if (n) - { - Skein_assert(n < msgByteCnt); /* check on our logic here */ - memcpy(&ctx->b[ctx->h.bCnt],msg,n); - msgByteCnt -= n; - msg += n; + Skein_assert(n < msgByteCnt); /* check on our logic here */ + memcpy(&ctx->b[ctx->h.bCnt], msg, n); + msgByteCnt -= n; + msg += n; ctx->h.bCnt += n; - } + } Skein_assert(ctx->h.bCnt == SKEIN_256_BLOCK_BYTES); - Skein_256_Process_Block(ctx,ctx->b,1,SKEIN_256_BLOCK_BYTES); + Skein_256_Process_Block(ctx, ctx->b, 1, SKEIN_256_BLOCK_BYTES); ctx->h.bCnt = 0; - } + } /* now process any remaining full blocks, directly from input message data */ - if (msgByteCnt > SKEIN_256_BLOCK_BYTES) - { - n = (msgByteCnt-1) / SKEIN_256_BLOCK_BYTES; /* number of full blocks to process */ - Skein_256_Process_Block(ctx,msg,n,SKEIN_256_BLOCK_BYTES); + if(msgByteCnt > SKEIN_256_BLOCK_BYTES) + { + n = (msgByteCnt - 1) / SKEIN_256_BLOCK_BYTES; /* number of full blocks to process */ + Skein_256_Process_Block(ctx, msg, n, SKEIN_256_BLOCK_BYTES); msgByteCnt -= n * SKEIN_256_BLOCK_BYTES; - msg += n * SKEIN_256_BLOCK_BYTES; - } - Skein_assert(ctx->h.bCnt == 0); + msg += n * SKEIN_256_BLOCK_BYTES; } + Skein_assert(ctx->h.bCnt == 0); + } /* copy any remaining source message data bytes into b[] */ - if (msgByteCnt) - { + if(msgByteCnt) + { Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES); - memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt); + memcpy(&ctx->b[ctx->h.bCnt], msg, msgByteCnt); ctx->h.bCnt += msgByteCnt; - } + } return SKEIN_SUCCESS; - } +} /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ /* finalize the hash computation and output the result */ -static int Skein_256_Final(Skein_256_Ctxt_t *ctx, u08b_t *hashVal) - { - size_t i,n,byteCnt; +static int Skein_256_Final(Skein_256_Ctxt_t* ctx, u08b_t* hashVal) +{ + size_t i, n, byteCnt; u64b_t X[SKEIN_256_STATE_WORDS]; - Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL); /* catch uninitialized context */ - ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ - if (ctx->h.bCnt < SKEIN_256_BLOCK_BYTES) /* zero pad b[] if necessary */ - memset(&ctx->b[ctx->h.bCnt],0,SKEIN_256_BLOCK_BYTES - ctx->h.bCnt); + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ + if(ctx->h.bCnt < SKEIN_256_BLOCK_BYTES) /* zero pad b[] if necessary */ + memset(&ctx->b[ctx->h.bCnt], 0, SKEIN_256_BLOCK_BYTES - ctx->h.bCnt); - Skein_256_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block */ + Skein_256_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt); /* process the final block */ /* now output the result */ - byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */ + byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */ /* run Threefish in "counter mode" to generate output */ - memset(ctx->b,0,sizeof(ctx->b)); /* zero out b[], so it can hold the counter */ - memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */ - for (i=0;i < byteCnt;i += SKEIN_256_BLOCK_BYTES) - { - ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */ - Skein_Start_New_Type(ctx,OUT_FINAL); - Skein_256_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */ - n = byteCnt - i; /* number of output bytes left to go */ - if (n >= SKEIN_256_BLOCK_BYTES) - n = SKEIN_256_BLOCK_BYTES; - Skein_Put64_LSB_First(hashVal+i,ctx->X,n); /* "output" the ctr mode bytes */ - Skein_Show_Final(256,&ctx->h,n,hashVal+i*SKEIN_256_BLOCK_BYTES); - memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */ - } - return SKEIN_SUCCESS; + memset(ctx->b, 0, sizeof(ctx->b)); /* zero out b[], so it can hold the counter */ + memcpy(X, ctx->X, sizeof(X)); /* keep a local copy of counter mode "key" */ + for(i = 0; i < byteCnt; i += SKEIN_256_BLOCK_BYTES) + { + ((u64b_t*)ctx->b)[0] = Skein_Swap64((u64b_t)i); /* build the counter block */ + Skein_Start_New_Type(ctx, OUT_FINAL); + Skein_256_Process_Block(ctx, ctx->b, 1, sizeof(u64b_t)); /* run "counter mode" */ + n = byteCnt - i; /* number of output bytes left to go */ + if(n >= SKEIN_256_BLOCK_BYTES) + n = SKEIN_256_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal + i, ctx->X, n); /* "output" the ctr mode bytes */ + Skein_Show_Final(256, &ctx->h, n, hashVal + i * SKEIN_256_BLOCK_BYTES); + memcpy(ctx->X, X, sizeof(X)); /* restore the counter mode key for next time */ } + return SKEIN_SUCCESS; +} #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) static size_t Skein_256_API_CodeSize(void) - { - return ((u08b_t *) Skein_256_API_CodeSize) - - ((u08b_t *) Skein_256_Init); - } +{ + return ((u08b_t*)Skein_256_API_CodeSize) - + ((u08b_t*)Skein_256_Init); +} #endif /*****************************************************************/ @@ -1384,47 +1527,54 @@ static size_t Skein_256_API_CodeSize(void) /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ /* init the context for a straight hashing operation */ -static int Skein_512_Init(Skein_512_Ctxt_t *ctx, size_t hashBitLen) - { - union - { - u08b_t b[SKEIN_512_STATE_BYTES]; - u64b_t w[SKEIN_512_STATE_WORDS]; - } cfg; /* config block */ +static int Skein_512_Init(Skein_512_Ctxt_t* ctx, size_t hashBitLen) +{ + union { + u08b_t b[SKEIN_512_STATE_BYTES]; + u64b_t w[SKEIN_512_STATE_WORDS]; + } cfg; /* config block */ - Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN); - ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ + Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN); + ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ - switch (hashBitLen) - { /* use pre-computed values, where available */ + switch(hashBitLen) + { /* use pre-computed values, where available */ #ifndef SKEIN_NO_PRECOMP - case 512: memcpy(ctx->X,SKEIN_512_IV_512,sizeof(ctx->X)); break; - case 384: memcpy(ctx->X,SKEIN_512_IV_384,sizeof(ctx->X)); break; - case 256: memcpy(ctx->X,SKEIN_512_IV_256,sizeof(ctx->X)); break; - case 224: memcpy(ctx->X,SKEIN_512_IV_224,sizeof(ctx->X)); break; + case 512: + memcpy(ctx->X, SKEIN_512_IV_512, sizeof(ctx->X)); + break; + case 384: + memcpy(ctx->X, SKEIN_512_IV_384, sizeof(ctx->X)); + break; + case 256: + memcpy(ctx->X, SKEIN_512_IV_256, sizeof(ctx->X)); + break; + case 224: + memcpy(ctx->X, SKEIN_512_IV_224, sizeof(ctx->X)); + break; #endif - default: - /* here if there is no precomputed IV value available */ - /* build/process the config block, type == CONFIG (could be precomputed) */ - Skein_Start_New_Type(ctx,CFG_FINAL); /* set tweaks: T0=0; T1=CFG | FINAL */ - - cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); /* set the schema, version */ - cfg.w[1] = Skein_Swap64(hashBitLen); /* hash result length in bits */ - cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL); - memset(&cfg.w[3],0,sizeof(cfg) - 3*sizeof(cfg.w[0])); /* zero pad config block */ - - /* compute the initial chaining values from config block */ - memset(ctx->X,0,sizeof(ctx->X)); /* zero the chaining variables */ - Skein_512_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN); - break; - } + default: + /* here if there is no precomputed IV value available */ + /* build/process the config block, type == CONFIG (could be precomputed) */ + Skein_Start_New_Type(ctx, CFG_FINAL); /* set tweaks: T0=0; T1=CFG | FINAL */ + + cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); /* set the schema, version */ + cfg.w[1] = Skein_Swap64(hashBitLen); /* hash result length in bits */ + cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL); + memset(&cfg.w[3], 0, sizeof(cfg) - 3 * sizeof(cfg.w[0])); /* zero pad config block */ + + /* compute the initial chaining values from config block */ + memset(ctx->X, 0, sizeof(ctx->X)); /* zero the chaining variables */ + Skein_512_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN); + break; + } /* The chaining vars ctx->X are now initialized for the given hashBitLen. */ /* Set up to process the data message portion of the hash (default) */ - Skein_Start_New_Type(ctx,MSG); /* T0=0, T1= MSG type */ + Skein_Start_New_Type(ctx, MSG); /* T0=0, T1= MSG type */ return SKEIN_SUCCESS; - } +} #if 0 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ @@ -1489,93 +1639,93 @@ static int Skein_512_InitExt(Skein_512_Ctxt_t *ctx,size_t hashBitLen,u64b_t tree /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ /* process the input bytes */ -static int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt) - { +static int Skein_512_Update(Skein_512_Ctxt_t* ctx, const u08b_t* msg, size_t msgByteCnt) +{ size_t n; - Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL); /* catch uninitialized context */ /* process full blocks, if any */ - if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES) + if(msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES) + { + if(ctx->h.bCnt) /* finish up any buffered message data */ { - if (ctx->h.bCnt) /* finish up any buffered message data */ + n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */ + if(n) { - n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */ - if (n) - { - Skein_assert(n < msgByteCnt); /* check on our logic here */ - memcpy(&ctx->b[ctx->h.bCnt],msg,n); - msgByteCnt -= n; - msg += n; + Skein_assert(n < msgByteCnt); /* check on our logic here */ + memcpy(&ctx->b[ctx->h.bCnt], msg, n); + msgByteCnt -= n; + msg += n; ctx->h.bCnt += n; - } + } Skein_assert(ctx->h.bCnt == SKEIN_512_BLOCK_BYTES); - Skein_512_Process_Block(ctx,ctx->b,1,SKEIN_512_BLOCK_BYTES); + Skein_512_Process_Block(ctx, ctx->b, 1, SKEIN_512_BLOCK_BYTES); ctx->h.bCnt = 0; - } + } /* now process any remaining full blocks, directly from input message data */ - if (msgByteCnt > SKEIN_512_BLOCK_BYTES) - { - n = (msgByteCnt-1) / SKEIN_512_BLOCK_BYTES; /* number of full blocks to process */ - Skein_512_Process_Block(ctx,msg,n,SKEIN_512_BLOCK_BYTES); + if(msgByteCnt > SKEIN_512_BLOCK_BYTES) + { + n = (msgByteCnt - 1) / SKEIN_512_BLOCK_BYTES; /* number of full blocks to process */ + Skein_512_Process_Block(ctx, msg, n, SKEIN_512_BLOCK_BYTES); msgByteCnt -= n * SKEIN_512_BLOCK_BYTES; - msg += n * SKEIN_512_BLOCK_BYTES; - } - Skein_assert(ctx->h.bCnt == 0); + msg += n * SKEIN_512_BLOCK_BYTES; } + Skein_assert(ctx->h.bCnt == 0); + } /* copy any remaining source message data bytes into b[] */ - if (msgByteCnt) - { + if(msgByteCnt) + { Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES); - memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt); + memcpy(&ctx->b[ctx->h.bCnt], msg, msgByteCnt); ctx->h.bCnt += msgByteCnt; - } + } return SKEIN_SUCCESS; - } +} /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ /* finalize the hash computation and output the result */ -static int Skein_512_Final(Skein_512_Ctxt_t *ctx, u08b_t *hashVal) - { - size_t i,n,byteCnt; +static int Skein_512_Final(Skein_512_Ctxt_t* ctx, u08b_t* hashVal) +{ + size_t i, n, byteCnt; u64b_t X[SKEIN_512_STATE_WORDS]; - Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL); /* catch uninitialized context */ - ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ - if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) /* zero pad b[] if necessary */ - memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt); + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ + if(ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) /* zero pad b[] if necessary */ + memset(&ctx->b[ctx->h.bCnt], 0, SKEIN_512_BLOCK_BYTES - ctx->h.bCnt); - Skein_512_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block */ + Skein_512_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt); /* process the final block */ /* now output the result */ - byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */ + byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */ /* run Threefish in "counter mode" to generate output */ - memset(ctx->b,0,sizeof(ctx->b)); /* zero out b[], so it can hold the counter */ - memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */ - for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++) - { - ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */ - Skein_Start_New_Type(ctx,OUT_FINAL); - Skein_512_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */ - n = byteCnt - i*SKEIN_512_BLOCK_BYTES; /* number of output bytes left to go */ - if (n >= SKEIN_512_BLOCK_BYTES) - n = SKEIN_512_BLOCK_BYTES; - Skein_Put64_LSB_First(hashVal+i*SKEIN_512_BLOCK_BYTES,ctx->X,n); /* "output" the ctr mode bytes */ - Skein_Show_Final(512,&ctx->h,n,hashVal+i*SKEIN_512_BLOCK_BYTES); - memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */ - } - return SKEIN_SUCCESS; + memset(ctx->b, 0, sizeof(ctx->b)); /* zero out b[], so it can hold the counter */ + memcpy(X, ctx->X, sizeof(X)); /* keep a local copy of counter mode "key" */ + for(i = 0; i * SKEIN_512_BLOCK_BYTES < byteCnt; i++) + { + ((u64b_t*)ctx->b)[0] = Skein_Swap64((u64b_t)i); /* build the counter block */ + Skein_Start_New_Type(ctx, OUT_FINAL); + Skein_512_Process_Block(ctx, ctx->b, 1, sizeof(u64b_t)); /* run "counter mode" */ + n = byteCnt - i * SKEIN_512_BLOCK_BYTES; /* number of output bytes left to go */ + if(n >= SKEIN_512_BLOCK_BYTES) + n = SKEIN_512_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal + i * SKEIN_512_BLOCK_BYTES, ctx->X, n); /* "output" the ctr mode bytes */ + Skein_Show_Final(512, &ctx->h, n, hashVal + i * SKEIN_512_BLOCK_BYTES); + memcpy(ctx->X, X, sizeof(X)); /* restore the counter mode key for next time */ } + return SKEIN_SUCCESS; +} #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) static size_t Skein_512_API_CodeSize(void) - { - return ((u08b_t *) Skein_512_API_CodeSize) - - ((u08b_t *) Skein_512_Init); - } +{ + return ((u08b_t*)Skein_512_API_CodeSize) - + ((u08b_t*)Skein_512_Init); +} #endif /*****************************************************************/ @@ -1583,46 +1733,51 @@ static size_t Skein_512_API_CodeSize(void) /*****************************************************************/ /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ /* init the context for a straight hashing operation */ -static int Skein1024_Init(Skein1024_Ctxt_t *ctx, size_t hashBitLen) - { - union - { - u08b_t b[SKEIN1024_STATE_BYTES]; - u64b_t w[SKEIN1024_STATE_WORDS]; - } cfg; /* config block */ +static int Skein1024_Init(Skein1024_Ctxt_t* ctx, size_t hashBitLen) +{ + union { + u08b_t b[SKEIN1024_STATE_BYTES]; + u64b_t w[SKEIN1024_STATE_WORDS]; + } cfg; /* config block */ - Skein_Assert(hashBitLen > 0,SKEIN_BAD_HASHLEN); - ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ + Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN); + ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ - switch (hashBitLen) - { /* use pre-computed values, where available */ + switch(hashBitLen) + { /* use pre-computed values, where available */ #ifndef SKEIN_NO_PRECOMP - case 512: memcpy(ctx->X,SKEIN1024_IV_512 ,sizeof(ctx->X)); break; - case 384: memcpy(ctx->X,SKEIN1024_IV_384 ,sizeof(ctx->X)); break; - case 1024: memcpy(ctx->X,SKEIN1024_IV_1024,sizeof(ctx->X)); break; + case 512: + memcpy(ctx->X, SKEIN1024_IV_512, sizeof(ctx->X)); + break; + case 384: + memcpy(ctx->X, SKEIN1024_IV_384, sizeof(ctx->X)); + break; + case 1024: + memcpy(ctx->X, SKEIN1024_IV_1024, sizeof(ctx->X)); + break; #endif - default: - /* here if there is no precomputed IV value available */ - /* build/process the config block, type == CONFIG (could be precomputed) */ - Skein_Start_New_Type(ctx,CFG_FINAL); /* set tweaks: T0=0; T1=CFG | FINAL */ - - cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); /* set the schema, version */ - cfg.w[1] = Skein_Swap64(hashBitLen); /* hash result length in bits */ - cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL); - memset(&cfg.w[3],0,sizeof(cfg) - 3*sizeof(cfg.w[0])); /* zero pad config block */ - - /* compute the initial chaining values from config block */ - memset(ctx->X,0,sizeof(ctx->X)); /* zero the chaining variables */ - Skein1024_Process_Block(ctx,cfg.b,1,SKEIN_CFG_STR_LEN); - break; - } + default: + /* here if there is no precomputed IV value available */ + /* build/process the config block, type == CONFIG (could be precomputed) */ + Skein_Start_New_Type(ctx, CFG_FINAL); /* set tweaks: T0=0; T1=CFG | FINAL */ + + cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); /* set the schema, version */ + cfg.w[1] = Skein_Swap64(hashBitLen); /* hash result length in bits */ + cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL); + memset(&cfg.w[3], 0, sizeof(cfg) - 3 * sizeof(cfg.w[0])); /* zero pad config block */ + + /* compute the initial chaining values from config block */ + memset(ctx->X, 0, sizeof(ctx->X)); /* zero the chaining variables */ + Skein1024_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN); + break; + } /* The chaining vars ctx->X are now initialized for the given hashBitLen. */ /* Set up to process the data message portion of the hash (default) */ - Skein_Start_New_Type(ctx,MSG); /* T0=0, T1= MSG type */ + Skein_Start_New_Type(ctx, MSG); /* T0=0, T1= MSG type */ return SKEIN_SUCCESS; - } +} #if 0 /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ @@ -1687,93 +1842,93 @@ static int Skein1024_InitExt(Skein1024_Ctxt_t *ctx,size_t hashBitLen,u64b_t tree /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ /* process the input bytes */ -static int Skein1024_Update(Skein1024_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt) - { +static int Skein1024_Update(Skein1024_Ctxt_t* ctx, const u08b_t* msg, size_t msgByteCnt) +{ size_t n; - Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL); /* catch uninitialized context */ /* process full blocks, if any */ - if (msgByteCnt + ctx->h.bCnt > SKEIN1024_BLOCK_BYTES) + if(msgByteCnt + ctx->h.bCnt > SKEIN1024_BLOCK_BYTES) + { + if(ctx->h.bCnt) /* finish up any buffered message data */ { - if (ctx->h.bCnt) /* finish up any buffered message data */ + n = SKEIN1024_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */ + if(n) { - n = SKEIN1024_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */ - if (n) - { - Skein_assert(n < msgByteCnt); /* check on our logic here */ - memcpy(&ctx->b[ctx->h.bCnt],msg,n); - msgByteCnt -= n; - msg += n; + Skein_assert(n < msgByteCnt); /* check on our logic here */ + memcpy(&ctx->b[ctx->h.bCnt], msg, n); + msgByteCnt -= n; + msg += n; ctx->h.bCnt += n; - } + } Skein_assert(ctx->h.bCnt == SKEIN1024_BLOCK_BYTES); - Skein1024_Process_Block(ctx,ctx->b,1,SKEIN1024_BLOCK_BYTES); + Skein1024_Process_Block(ctx, ctx->b, 1, SKEIN1024_BLOCK_BYTES); ctx->h.bCnt = 0; - } + } /* now process any remaining full blocks, directly from input message data */ - if (msgByteCnt > SKEIN1024_BLOCK_BYTES) - { - n = (msgByteCnt-1) / SKEIN1024_BLOCK_BYTES; /* number of full blocks to process */ - Skein1024_Process_Block(ctx,msg,n,SKEIN1024_BLOCK_BYTES); + if(msgByteCnt > SKEIN1024_BLOCK_BYTES) + { + n = (msgByteCnt - 1) / SKEIN1024_BLOCK_BYTES; /* number of full blocks to process */ + Skein1024_Process_Block(ctx, msg, n, SKEIN1024_BLOCK_BYTES); msgByteCnt -= n * SKEIN1024_BLOCK_BYTES; - msg += n * SKEIN1024_BLOCK_BYTES; - } - Skein_assert(ctx->h.bCnt == 0); + msg += n * SKEIN1024_BLOCK_BYTES; } + Skein_assert(ctx->h.bCnt == 0); + } /* copy any remaining source message data bytes into b[] */ - if (msgByteCnt) - { + if(msgByteCnt) + { Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES); - memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt); + memcpy(&ctx->b[ctx->h.bCnt], msg, msgByteCnt); ctx->h.bCnt += msgByteCnt; - } + } return SKEIN_SUCCESS; - } +} /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ /* finalize the hash computation and output the result */ -static int Skein1024_Final(Skein1024_Ctxt_t *ctx, u08b_t *hashVal) - { - size_t i,n,byteCnt; +static int Skein1024_Final(Skein1024_Ctxt_t* ctx, u08b_t* hashVal) +{ + size_t i, n, byteCnt; u64b_t X[SKEIN1024_STATE_WORDS]; - Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL); /* catch uninitialized context */ - ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ - if (ctx->h.bCnt < SKEIN1024_BLOCK_BYTES) /* zero pad b[] if necessary */ - memset(&ctx->b[ctx->h.bCnt],0,SKEIN1024_BLOCK_BYTES - ctx->h.bCnt); + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ + if(ctx->h.bCnt < SKEIN1024_BLOCK_BYTES) /* zero pad b[] if necessary */ + memset(&ctx->b[ctx->h.bCnt], 0, SKEIN1024_BLOCK_BYTES - ctx->h.bCnt); - Skein1024_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block */ + Skein1024_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt); /* process the final block */ /* now output the result */ - byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */ + byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */ /* run Threefish in "counter mode" to generate output */ - memset(ctx->b,0,sizeof(ctx->b)); /* zero out b[], so it can hold the counter */ - memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */ - for (i=0;i*SKEIN1024_BLOCK_BYTES < byteCnt;i++) - { - ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */ - Skein_Start_New_Type(ctx,OUT_FINAL); - Skein1024_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */ - n = byteCnt - i*SKEIN1024_BLOCK_BYTES; /* number of output bytes left to go */ - if (n >= SKEIN1024_BLOCK_BYTES) - n = SKEIN1024_BLOCK_BYTES; - Skein_Put64_LSB_First(hashVal+i*SKEIN1024_BLOCK_BYTES,ctx->X,n); /* "output" the ctr mode bytes */ - Skein_Show_Final(1024,&ctx->h,n,hashVal+i*SKEIN1024_BLOCK_BYTES); - memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */ - } - return SKEIN_SUCCESS; + memset(ctx->b, 0, sizeof(ctx->b)); /* zero out b[], so it can hold the counter */ + memcpy(X, ctx->X, sizeof(X)); /* keep a local copy of counter mode "key" */ + for(i = 0; i * SKEIN1024_BLOCK_BYTES < byteCnt; i++) + { + ((u64b_t*)ctx->b)[0] = Skein_Swap64((u64b_t)i); /* build the counter block */ + Skein_Start_New_Type(ctx, OUT_FINAL); + Skein1024_Process_Block(ctx, ctx->b, 1, sizeof(u64b_t)); /* run "counter mode" */ + n = byteCnt - i * SKEIN1024_BLOCK_BYTES; /* number of output bytes left to go */ + if(n >= SKEIN1024_BLOCK_BYTES) + n = SKEIN1024_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal + i * SKEIN1024_BLOCK_BYTES, ctx->X, n); /* "output" the ctr mode bytes */ + Skein_Show_Final(1024, &ctx->h, n, hashVal + i * SKEIN1024_BLOCK_BYTES); + memcpy(ctx->X, X, sizeof(X)); /* restore the counter mode key for next time */ } + return SKEIN_SUCCESS; +} #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) static size_t Skein1024_API_CodeSize(void) - { - return ((u08b_t *) Skein1024_API_CodeSize) - - ((u08b_t *) Skein1024_Init); - } +{ + return ((u08b_t*)Skein1024_API_CodeSize) - + ((u08b_t*)Skein1024_Init); +} #endif /**************** Functions to support MAC/tree hashing ***************/ @@ -1828,7 +1983,6 @@ static int Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, u08b_t *hashVal) return SKEIN_SUCCESS; } - #if SKEIN_TREE_HASH /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ /* just do the OUTPUT stage */ @@ -1921,116 +2075,126 @@ static int Skein1024_Output(Skein1024_Ctxt_t *ctx, u08b_t *hashVal) typedef struct { - uint_t statebits; /* 256, 512, or 1024 */ - union - { - Skein_Ctxt_Hdr_t h; /* common header "overlay" */ - Skein_256_Ctxt_t ctx_256; - Skein_512_Ctxt_t ctx_512; - Skein1024_Ctxt_t ctx1024; - } u; -} -hashState; + uint_t statebits; /* 256, 512, or 1024 */ + union { + Skein_Ctxt_Hdr_t h; /* common header "overlay" */ + Skein_256_Ctxt_t ctx_256; + Skein_512_Ctxt_t ctx_512; + Skein1024_Ctxt_t ctx1024; + } u; +} hashState; /* "incremental" hashing API */ -static SkeinHashReturn Init (hashState *state, int hashbitlen); -static SkeinHashReturn Update(hashState *state, const SkeinBitSequence *data, SkeinDataLength databitlen); -static SkeinHashReturn Final (hashState *state, SkeinBitSequence *hashval); +static SkeinHashReturn Init(hashState* state, int hashbitlen); +static SkeinHashReturn Update(hashState* state, const SkeinBitSequence* data, SkeinDataLength databitlen); +static SkeinHashReturn Final(hashState* state, SkeinBitSequence* hashval); /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ /* select the context size and init the context */ -static SkeinHashReturn Init(hashState *state, int hashbitlen) +static SkeinHashReturn Init(hashState* state, int hashbitlen) { #if SKEIN_256_NIST_MAX_HASH_BITS - if (hashbitlen <= SKEIN_256_NIST_MAX_HASHBITS) - { - Skein_Assert(hashbitlen > 0,BAD_HASHLEN); - state->statebits = 64*SKEIN_256_STATE_WORDS; - return Skein_256_Init(&state->u.ctx_256,(size_t) hashbitlen); - } -#endif - if (hashbitlen <= SKEIN_512_NIST_MAX_HASHBITS) - { - state->statebits = 64*SKEIN_512_STATE_WORDS; - return Skein_512_Init(&state->u.ctx_512,(size_t) hashbitlen); - } - else - { - state->statebits = 64*SKEIN1024_STATE_WORDS; - return Skein1024_Init(&state->u.ctx1024,(size_t) hashbitlen); - } + if(hashbitlen <= SKEIN_256_NIST_MAX_HASHBITS) + { + Skein_Assert(hashbitlen > 0, BAD_HASHLEN); + state->statebits = 64 * SKEIN_256_STATE_WORDS; + return Skein_256_Init(&state->u.ctx_256, (size_t)hashbitlen); + } +#endif + if(hashbitlen <= SKEIN_512_NIST_MAX_HASHBITS) + { + state->statebits = 64 * SKEIN_512_STATE_WORDS; + return Skein_512_Init(&state->u.ctx_512, (size_t)hashbitlen); + } + else + { + state->statebits = 64 * SKEIN1024_STATE_WORDS; + return Skein1024_Init(&state->u.ctx1024, (size_t)hashbitlen); + } } /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ /* process data to be hashed */ -static SkeinHashReturn Update(hashState *state, const SkeinBitSequence *data, SkeinDataLength databitlen) +static SkeinHashReturn Update(hashState* state, const SkeinBitSequence* data, SkeinDataLength databitlen) { - /* only the final Update() call is allowed do partial bytes, else assert an error */ - Skein_Assert((state->u.h.T[1] & SKEIN_T1_FLAG_BIT_PAD) == 0 || databitlen == 0, SKEIN_FAIL); + /* only the final Update() call is allowed do partial bytes, else assert an error */ + Skein_Assert((state->u.h.T[1] & SKEIN_T1_FLAG_BIT_PAD) == 0 || databitlen == 0, SKEIN_FAIL); - Skein_Assert(state->statebits % 256 == 0 && (state->statebits-256) < 1024,SKEIN_FAIL); - if ((databitlen & 7) == 0) /* partial bytes? */ - { - switch ((state->statebits >> 8) & 3) + Skein_Assert(state->statebits % 256 == 0 && (state->statebits - 256) < 1024, SKEIN_FAIL); + if((databitlen & 7) == 0) /* partial bytes? */ { - case 2: return Skein_512_Update(&state->u.ctx_512,data,databitlen >> 3); - case 1: return Skein_256_Update(&state->u.ctx_256,data,databitlen >> 3); - case 0: return Skein1024_Update(&state->u.ctx1024,data,databitlen >> 3); - default: return SKEIN_FAIL; + switch((state->statebits >> 8) & 3) + { + case 2: + return Skein_512_Update(&state->u.ctx_512, data, databitlen >> 3); + case 1: + return Skein_256_Update(&state->u.ctx_256, data, databitlen >> 3); + case 0: + return Skein1024_Update(&state->u.ctx1024, data, databitlen >> 3); + default: + return SKEIN_FAIL; + } } - } - else - { /* handle partial final byte */ - size_t bCnt = (databitlen >> 3) + 1; /* number of bytes to handle (nonzero here!) */ - u08b_t b,mask; + else + { /* handle partial final byte */ + size_t bCnt = (databitlen >> 3) + 1; /* number of bytes to handle (nonzero here!) */ + u08b_t b, mask; - mask = (u08b_t) (1u << (7 - (databitlen & 7))); /* partial byte bit mask */ - b = (u08b_t) ((data[bCnt-1] & (0-mask)) | mask); /* apply bit padding on final byte */ + mask = (u08b_t)(1u << (7 - (databitlen & 7))); /* partial byte bit mask */ + b = (u08b_t)((data[bCnt - 1] & (0 - mask)) | mask); /* apply bit padding on final byte */ - switch ((state->statebits >> 8) & 3) - { - case 2: Skein_512_Update(&state->u.ctx_512,data,bCnt-1); /* process all but the final byte */ - Skein_512_Update(&state->u.ctx_512,&b , 1 ); /* process the (masked) partial byte */ - break; - case 1: Skein_256_Update(&state->u.ctx_256,data,bCnt-1); /* process all but the final byte */ - Skein_256_Update(&state->u.ctx_256,&b , 1 ); /* process the (masked) partial byte */ - break; - case 0: Skein1024_Update(&state->u.ctx1024,data,bCnt-1); /* process all but the final byte */ - Skein1024_Update(&state->u.ctx1024,&b , 1 ); /* process the (masked) partial byte */ - break; - default: return SKEIN_FAIL; - } - Skein_Set_Bit_Pad_Flag(state->u.h); /* set tweak flag for the final call */ + switch((state->statebits >> 8) & 3) + { + case 2: + Skein_512_Update(&state->u.ctx_512, data, bCnt - 1); /* process all but the final byte */ + Skein_512_Update(&state->u.ctx_512, &b, 1); /* process the (masked) partial byte */ + break; + case 1: + Skein_256_Update(&state->u.ctx_256, data, bCnt - 1); /* process all but the final byte */ + Skein_256_Update(&state->u.ctx_256, &b, 1); /* process the (masked) partial byte */ + break; + case 0: + Skein1024_Update(&state->u.ctx1024, data, bCnt - 1); /* process all but the final byte */ + Skein1024_Update(&state->u.ctx1024, &b, 1); /* process the (masked) partial byte */ + break; + default: + return SKEIN_FAIL; + } + Skein_Set_Bit_Pad_Flag(state->u.h); /* set tweak flag for the final call */ - return SKEIN_SUCCESS; - } + return SKEIN_SUCCESS; + } } /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ /* finalize hash computation and output the result (hashbitlen bits) */ -static SkeinHashReturn Final(hashState *state, SkeinBitSequence *hashval) +static SkeinHashReturn Final(hashState* state, SkeinBitSequence* hashval) { - Skein_Assert(state->statebits % 256 == 0 && (state->statebits-256) < 1024,FAIL); - switch ((state->statebits >> 8) & 3) - { - case 2: return Skein_512_Final(&state->u.ctx_512,hashval); - case 1: return Skein_256_Final(&state->u.ctx_256,hashval); - case 0: return Skein1024_Final(&state->u.ctx1024,hashval); - default: return SKEIN_FAIL; - } + Skein_Assert(state->statebits % 256 == 0 && (state->statebits - 256) < 1024, FAIL); + switch((state->statebits >> 8) & 3) + { + case 2: + return Skein_512_Final(&state->u.ctx_512, hashval); + case 1: + return Skein_256_Final(&state->u.ctx_256, hashval); + case 0: + return Skein1024_Final(&state->u.ctx1024, hashval); + default: + return SKEIN_FAIL; + } } /*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ /* all-in-one hash function */ -SkeinHashReturn skein_hash(int hashbitlen, const SkeinBitSequence *data, /* all-in-one call */ - SkeinDataLength databitlen,SkeinBitSequence *hashval) +SkeinHashReturn skein_hash(int hashbitlen, const SkeinBitSequence* data, /* all-in-one call */ + SkeinDataLength databitlen, SkeinBitSequence* hashval) { - hashState state; - SkeinHashReturn r = Init(&state,hashbitlen); - if (r == SKEIN_SUCCESS) - { /* these calls do not fail when called properly */ - r = Update(&state,data,databitlen); - Final(&state,hashval); - } - return r; + hashState state; + SkeinHashReturn r = Init(&state, hashbitlen); + if(r == SKEIN_SUCCESS) + { /* these calls do not fail when called properly */ + r = Update(&state, data, databitlen); + Final(&state, hashval); + } + return r; } diff --git a/xmrstak/backend/cpu/crypto/c_skein.h b/xmrstak/backend/cpu/crypto/c_skein.h index 1aa11dea3..52f359e82 100644 --- a/xmrstak/backend/cpu/crypto/c_skein.h +++ b/xmrstak/backend/cpu/crypto/c_skein.h @@ -1,5 +1,5 @@ #ifndef _SKEIN_H_ -#define _SKEIN_H_ 1 +#define _SKEIN_H_ 1 /************************************************************************** ** ** Interface declarations and internal definitions for Skein hashing. @@ -27,21 +27,20 @@ ** 1: return SKEIN_FAIL to flag errors ** ***************************************************************************/ -#include "skein_port.h" /* get platform-specific definitions */ +#include "skein_port.h" /* get platform-specific definitions */ typedef enum { - SKEIN_SUCCESS = 0, /* return codes from Skein calls */ - SKEIN_FAIL = 1, - SKEIN_BAD_HASHLEN = 2 -} -SkeinHashReturn; + SKEIN_SUCCESS = 0, /* return codes from Skein calls */ + SKEIN_FAIL = 1, + SKEIN_BAD_HASHLEN = 2 +} SkeinHashReturn; -typedef uint32_t SkeinDataLength; /* bit count type */ -typedef u08b_t SkeinBitSequence; /* bit stream type */ +typedef uint32_t SkeinDataLength; /* bit count type */ +typedef u08b_t SkeinBitSequence; /* bit stream type */ /* "all-in-one" call */ -SkeinHashReturn skein_hash(int hashbitlen, const SkeinBitSequence *data, - SkeinDataLength databitlen, SkeinBitSequence *hashval); +SkeinHashReturn skein_hash(int hashbitlen, const SkeinBitSequence* data, + SkeinDataLength databitlen, SkeinBitSequence* hashval); -#endif /* ifndef _SKEIN_H_ */ +#endif /* ifndef _SKEIN_H_ */ diff --git a/xmrstak/backend/cpu/crypto/cn_gpu.hpp b/xmrstak/backend/cpu/crypto/cn_gpu.hpp index 5844d3814..2d333d118 100644 --- a/xmrstak/backend/cpu/crypto/cn_gpu.hpp +++ b/xmrstak/backend/cpu/crypto/cn_gpu.hpp @@ -4,8 +4,8 @@ #include #if defined(_WIN32) || defined(_WIN64) -#include #include +#include #define HAS_WIN_INTRIN_API #endif diff --git a/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp b/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp index 79b38373a..efded74c8 100644 --- a/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp +++ b/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp @@ -1,12 +1,12 @@ -#include "cn_gpu.hpp" #include "../../cryptonight.hpp" +#include "cn_gpu.hpp" -#pragma GCC target ("avx2") +#pragma GCC target("avx2") #ifndef _mm256_bslli_epi128 - #define _mm256_bslli_epi128(a, count) _mm256_slli_si256((a), (count)) +#define _mm256_bslli_epi128(a, count) _mm256_slli_si256((a), (count)) #endif #ifndef _mm256_bsrli_epi128 - #define _mm256_bsrli_epi128(a, count) _mm256_srli_si256((a), (count)) +#define _mm256_bsrli_epi128(a, count) _mm256_srli_si256((a), (count)) #endif inline void prep_dv_avx(__m256i* idx, __m256i& v, __m256& n01) @@ -67,7 +67,7 @@ inline void round_compute(const __m256& n0, const __m256& n1, const __m256& n2, // 112×4 = 448 template inline __m256i double_comupte(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3, - float lcnt, float hcnt, const __m256& rnd_c, __m256& sum) + float lcnt, float hcnt, const __m256& rnd_c, __m256& sum) { __m256 c = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_set1_ps(lcnt)), _mm_set1_ps(hcnt), 1); __m256 r = _mm256_setzero_ps(); @@ -92,7 +92,7 @@ inline __m256i double_comupte(const __m256& n0, const __m256& n1, const __m256& template inline void double_comupte_wrap(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3, - float lcnt, float hcnt, const __m256& rnd_c, __m256& sum, __m256i& out) + float lcnt, float hcnt, const __m256& rnd_c, __m256& sum, __m256i& out) { __m256i r = double_comupte(n0, n1, n2, n3, lcnt, hcnt, rnd_c, sum); if(rot != 0) @@ -101,9 +101,7 @@ inline void double_comupte_wrap(const __m256& n0, const __m256& n1, const __m256 out = _mm256_xor_si256(out, r); } - -inline __m256i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n, const uint32_t mask) { return reinterpret_cast<__m256i*>(lpad + (idx & mask) + n*16); } - +inline __m256i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n, const uint32_t mask) { return reinterpret_cast<__m256i*>(lpad + (idx & mask) + n * 16); } void cn_gpu_inner_avx(const uint8_t* spad, uint8_t* lpad, const xmrstak_algo& algo) { @@ -155,7 +153,7 @@ void cn_gpu_inner_avx(const uint8_t* spad, uint8_t* lpad, const xmrstak_algo& al sum1 = _mm256_add_ps(suma, sumb); out2 = _mm256_xor_si256(out2, out); - out2 = _mm256_xor_si256(_mm256_permute2x128_si256(out2,out2,0x41), out2); + out2 = _mm256_xor_si256(_mm256_permute2x128_si256(out2, out2, 0x41), out2); suma = _mm256_permute2f128_ps(sum0, sum1, 0x30); sumb = _mm256_permute2f128_ps(sum0, sum1, 0x21); sum0 = _mm256_add_ps(suma, sumb); diff --git a/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp b/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp index c8627d8b8..d65d9651e 100644 --- a/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp +++ b/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp @@ -1,7 +1,7 @@ -#include "cn_gpu.hpp" #include "../../cryptonight.hpp" +#include "cn_gpu.hpp" -#pragma GCC target ("sse2") +#pragma GCC target("sse2") inline void prep_dv(__m128i* idx, __m128i& v, __m128& n) { @@ -21,13 +21,13 @@ inline void sub_round(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c, { n1 = _mm_add_ps(n1, c); __m128 nn = _mm_mul_ps(n0, c); - nn = _mm_mul_ps(n1, _mm_mul_ps(nn,nn)); + nn = _mm_mul_ps(n1, _mm_mul_ps(nn, nn)); nn = fma_break(nn); n = _mm_add_ps(n, nn); n3 = _mm_sub_ps(n3, c); __m128 dd = _mm_mul_ps(n2, c); - dd = _mm_mul_ps(n3, _mm_mul_ps(dd,dd)); + dd = _mm_mul_ps(n3, _mm_mul_ps(dd, dd)); dd = fma_break(dd); d = _mm_add_ps(d, dd); @@ -57,12 +57,12 @@ inline void round_compute(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd // Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0 d = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0xFF7FFFFF)), d); d = _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x40000000)), d); - r =_mm_add_ps(r, _mm_div_ps(n,d)); + r = _mm_add_ps(r, _mm_div_ps(n, d)); } // 112×4 = 448 -template -inline __m128i single_comupte(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum) +template +inline __m128i single_comupte(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum) { __m128 c = _mm_set1_ps(cnt); __m128 r = _mm_setzero_ps(); @@ -85,8 +85,8 @@ inline __m128i single_comupte(__m128 n0, __m128 n1, __m128 n2, __m128 n3, floa return _mm_cvttps_epi32(r); } -template -inline void single_comupte_wrap(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out) +template +inline void single_comupte_wrap(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out) { __m128i r = single_comupte(n0, n1, n2, n3, cnt, rnd_c, sum); if(rot != 0) @@ -94,7 +94,7 @@ inline void single_comupte_wrap(__m128 n0, __m128 n1, __m128 n2, __m128 n3, flo out = _mm_xor_si128(out, r); } -inline __m128i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n, const uint32_t mask) { return reinterpret_cast<__m128i*>(lpad + (idx & mask) + n*16); } +inline __m128i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n, const uint32_t mask) { return reinterpret_cast<__m128i*>(lpad + (idx & mask) + n * 16); } void cn_gpu_inner_ssse3(const uint8_t* spad, uint8_t* lpad, const xmrstak_algo& algo) { diff --git a/xmrstak/backend/cpu/crypto/cryptonight.h b/xmrstak/backend/cpu/crypto/cryptonight.h index 488805ec0..c8b8320b0 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight.h +++ b/xmrstak/backend/cpu/crypto/cryptonight.h @@ -1,6 +1,6 @@ #pragma once -#include #include +#include #include "variant4_random_math.h" @@ -12,8 +12,8 @@ struct cryptonight_ctx; -typedef void (*cn_mainloop_fun)(cryptonight_ctx *ctx); -typedef void (*cn_double_mainloop_fun)(cryptonight_ctx*, cryptonight_ctx*); +typedef void (*cn_mainloop_fun)(cryptonight_ctx* ctx); +typedef void (*cn_double_mainloop_fun)(cryptonight_ctx*, cryptonight_ctx*); typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**, const xmrstak_algo&); void v4_compile_code(size_t N, cryptonight_ctx* ctx, int code_size); @@ -36,11 +36,9 @@ struct cryptonight_ctx int asm_version = 0; xmrstak_algo last_algo = invalid_algo; - union - { + union { extra_ctx_r cn_r_ctx; }; - }; struct alloc_msg @@ -51,5 +49,3 @@ struct alloc_msg size_t cryptonight_init(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg); cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg); void cryptonight_free_ctx(cryptonight_ctx* ctx); - - diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index d7316b25e..6c9e3390c 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -15,22 +15,24 @@ */ #pragma once -#include "cryptonight.h" -#include "xmrstak/backend/cryptonight.hpp" #include "../../miner_work.hpp" #include "cn_gpu.hpp" +#include "cryptonight.h" +#include "xmrstak/backend/cryptonight.hpp" +#include #include #include -#include #include #ifdef _WIN64 -# include -# include -# include -# include +#include +// this comment disable clang include reordering +#include +#include +// this comment disable clang include reordering for windows.h +#include #else -# include +#include #endif #ifdef __GNUC__ @@ -54,9 +56,9 @@ static inline uint64_t _umul128(uint64_t a, uint64_t b, uint64_t* hi) extern "C" { - void keccak(const uint8_t *in, int inlen, uint8_t *md, int mdlen); + void keccak(const uint8_t* in, int inlen, uint8_t* md, int mdlen); void keccakf(uint64_t st[25], int rounds); - extern void(*const extra_hashes[4])(const void *, uint32_t, char *); + extern void (*const extra_hashes[4])(const void*, uint32_t, char*); } // This will shift and xor tmp1 into itself as 4 32-bit vals such as @@ -73,7 +75,7 @@ static inline __m128i sl_xor(__m128i tmp1) return tmp1; } -template +template static inline void aes_genkey_sub(__m128i* xout0, __m128i* xout2) { __m128i xout1 = _mm_aeskeygenassist_si128(*xout2, rcon); @@ -98,14 +100,14 @@ static inline void soft_aes_genkey_sub(__m128i* xout0, __m128i* xout2, uint8_t r *xout2 = _mm_xor_si128(*xout2, xout1); } -template +template static inline void aes_genkey(const __m128i* memory, __m128i* k0, __m128i* k1, __m128i* k2, __m128i* k3, __m128i* k4, __m128i* k5, __m128i* k6, __m128i* k7, __m128i* k8, __m128i* k9) { __m128i xout0, xout2; xout0 = _mm_load_si128(memory); - xout2 = _mm_load_si128(memory+1); + xout2 = _mm_load_si128(memory + 1); *k0 = xout0; *k1 = xout2; @@ -175,7 +177,7 @@ inline void mix_and_propagate(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3 x7 = _mm_xor_si128(x7, tmp0); } -template +template void cn_explode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_algo& algo) { constexpr bool HEAVY_MIX = ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast; @@ -197,7 +199,7 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_ if(HEAVY_MIX) { - for(size_t i=0; i < 16; i++) + for(size_t i = 0; i < 16; i++) { if(SOFT_AES) { @@ -230,7 +232,7 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_ } const size_t MEM = algo.Mem(); - for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) + for(size_t i = 0; i < MEM / sizeof(__m128i); i += 8) { if(SOFT_AES) { @@ -277,29 +279,29 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_ } } -template +template void cn_explode_scratchpad_gpu(const uint8_t* input, uint8_t* output, const xmrstak_algo& algo) { constexpr size_t hash_size = 200; // 25x8 bytes alignas(128) uint64_t hash[25]; const size_t mem = algo.Mem(); - for (uint64_t i = 0; i < mem / 512; i++) + for(uint64_t i = 0; i < mem / 512; i++) { memcpy(hash, input, hash_size); hash[0] ^= i; keccakf(hash, 24); memcpy(output, hash, 160); - output+=160; + output += 160; keccakf(hash, 24); memcpy(output, hash, 176); - output+=176; + output += 176; keccakf(hash, 24); memcpy(output, hash, 176); - output+=176; + output += 176; if(PREFETCH) { @@ -311,11 +313,11 @@ void cn_explode_scratchpad_gpu(const uint8_t* input, uint8_t* output, const xmrs } } -template +template void cn_implode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_algo& algo) { constexpr bool HEAVY_MIX = ALGO == cryptonight_heavy || ALGO == cryptonight_haven || - ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast || ALGO == cryptonight_gpu; + ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast || ALGO == cryptonight_gpu; // This is more than we have registers, compiler will assign 2 keys on the stack __m128i xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7; @@ -333,7 +335,7 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_ xout7 = _mm_load_si128(output + 11); const size_t MEM = algo.Mem(); - for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) + for(size_t i = 0; i < MEM / sizeof(__m128i); i += 8) { if(PREFETCH) _mm_prefetch((const char*)input + i + 0, _MM_HINT_NTA); @@ -384,7 +386,7 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_ if(HEAVY_MIX) { - for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) + for(size_t i = 0; i < MEM / sizeof(__m128i); i += 8) { if(PREFETCH) _mm_prefetch((const char*)input + i + 0, _MM_HINT_NTA); @@ -433,7 +435,7 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_ mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7); } - for(size_t i=0; i < 16; i++) + for(size_t i = 0; i < 16; i++) { if(SOFT_AES) { @@ -494,7 +496,8 @@ inline uint64_t int_sqrt33_1_double_precision(const uint64_t n0) #else // GCC versions prior to 7 don't generate correct assembly for _subborrow_u64 -> _addcarry_u64 sequence // Fallback to simpler code - if (x2 < n0) ++r; + if(x2 < n0) + ++r; #endif return r; } @@ -505,7 +508,7 @@ inline __m128i aes_round_bittube2(const __m128i& val, const __m128i& key) alignas(16) uint32_t x[4]; _mm_store_si128((__m128i*)k, key); _mm_store_si128((__m128i*)x, _mm_xor_si128(val, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))); // x = ~val - #define BYTE(p, i) ((unsigned char*)&p)[i] +#define BYTE(p, i) ((unsigned char*)&p)[i] k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^ saes_table[3][BYTE(x[3], 3)]; x[0] ^= k[0]; k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ saes_table[3][BYTE(x[0], 3)]; @@ -513,11 +516,11 @@ inline __m128i aes_round_bittube2(const __m128i& val, const __m128i& key) k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ saes_table[3][BYTE(x[1], 3)]; x[2] ^= k[2]; k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ saes_table[3][BYTE(x[2], 3)]; - #undef BYTE +#undef BYTE return _mm_load_si128((__m128i*)k); } -template +template inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) { mem_out[0] = _mm_cvtsi128_si64(tmp); @@ -541,7 +544,6 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) mem_out[1] = vh; } - } /** optimal type for sqrt @@ -550,18 +552,18 @@ inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) * * @tparam N number of hashes per thread */ -template +template struct GetOptimalSqrtType { using type = __m128i; }; -template<> +template <> struct GetOptimalSqrtType<1u> { using type = uint64_t; }; -template +template using GetOptimalSqrtType_t = typename GetOptimalSqrtType::type; /** assign a value and convert if necessary @@ -625,273 +627,275 @@ inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var) cx = _mm_xor_si128(cx, _mm_cvttps_epi32(nc)); } -#define CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1, cx) \ - /* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \ +#define CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1, cx) \ + /* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \ if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_r || ALGO == cryptonight_r_wow) \ - { \ - const uint64_t idx1 = idx0 & MASK; \ - const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]); \ - const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \ - const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \ - _mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \ - _mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \ - _mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ - if (ALGO == cryptonight_r) \ - cx = _mm_xor_si128(_mm_xor_si128(cx, chunk3), _mm_xor_si128(chunk1, chunk2)); \ - } \ - if(ALGO == cryptonight_v8_reversewaltz) \ - { \ - const uint64_t idx1 = idx0 & MASK; \ - const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]); \ - const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \ - const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \ - _mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \ - _mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \ - _mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ + { \ + const uint64_t idx1 = idx0 & MASK; \ + const __m128i chunk1 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x10]); \ + const __m128i chunk2 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x20]); \ + const __m128i chunk3 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x30]); \ + _mm_store_si128((__m128i*)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \ + _mm_store_si128((__m128i*)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \ + _mm_store_si128((__m128i*)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ + if(ALGO == cryptonight_r) \ + cx = _mm_xor_si128(_mm_xor_si128(cx, chunk3), _mm_xor_si128(chunk1, chunk2)); \ + } \ + if(ALGO == cryptonight_v8_reversewaltz) \ + { \ + const uint64_t idx1 = idx0 & MASK; \ + const __m128i chunk3 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x10]); \ + const __m128i chunk2 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x20]); \ + const __m128i chunk1 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x30]); \ + _mm_store_si128((__m128i*)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \ + _mm_store_si128((__m128i*)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \ + _mm_store_si128((__m128i*)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ } -#define CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi) \ - /* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \ - if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_r_wow) \ - { \ - const uint64_t idx1 = idx0 & MASK; \ - const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi)); \ - const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \ - hi ^= ((uint64_t*)&chunk2)[0]; \ - lo ^= ((uint64_t*)&chunk2)[1]; \ - const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \ - _mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \ - _mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \ - _mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ - } \ - if(ALGO == cryptonight_v8_reversewaltz) \ - { \ - const uint64_t idx1 = idx0 & MASK; \ - const __m128i chunk3 = _mm_xor_si128(_mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi)); \ - const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \ - hi ^= ((uint64_t*)&chunk2)[0]; \ - lo ^= ((uint64_t*)&chunk2)[1]; \ - const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \ - _mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \ - _mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \ - _mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ +#define CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi) \ + /* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \ + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_r_wow) \ + { \ + const uint64_t idx1 = idx0 & MASK; \ + const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i*)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi)); \ + const __m128i chunk2 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x20]); \ + hi ^= ((uint64_t*)&chunk2)[0]; \ + lo ^= ((uint64_t*)&chunk2)[1]; \ + const __m128i chunk3 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x30]); \ + _mm_store_si128((__m128i*)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \ + _mm_store_si128((__m128i*)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \ + _mm_store_si128((__m128i*)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ + } \ + if(ALGO == cryptonight_v8_reversewaltz) \ + { \ + const uint64_t idx1 = idx0 & MASK; \ + const __m128i chunk3 = _mm_xor_si128(_mm_load_si128((__m128i*)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi)); \ + const __m128i chunk2 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x20]); \ + hi ^= ((uint64_t*)&chunk2)[0]; \ + lo ^= ((uint64_t*)&chunk2)[1]; \ + const __m128i chunk1 = _mm_load_si128((__m128i*)&l0[idx1 ^ 0x30]); \ + _mm_store_si128((__m128i*)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \ + _mm_store_si128((__m128i*)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \ + _mm_store_si128((__m128i*)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ } -#define CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl) \ - if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) \ - { \ - uint64_t sqrt_result_tmp; \ - assign(sqrt_result_tmp, sqrt_result); \ - /* Use division and square root results from the _previous_ iteration to hide the latency */ \ - const uint64_t cx_64 = _mm_cvtsi128_si64(cx); \ - cl ^= static_cast(_mm_cvtsi128_si64(division_result_xmm)) ^ (sqrt_result_tmp << 32); \ - const uint32_t d = (cx_64 + (sqrt_result_tmp << 1)) | 0x80000001UL; \ - /* Most and least significant bits in the divisor are set to 1 \ - * to make sure we don't divide by a small or even number, \ - * so there are no shortcuts for such cases \ - * \ - * Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4 \ - * We drop the highest bit to fit both quotient and remainder in 32 bits \ - */ \ - /* Compiler will optimize it to a single div instruction */ \ - const uint64_t cx_s = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \ - const uint64_t division_result = static_cast(cx_s / d) + ((cx_s % d) << 32); \ - division_result_xmm = _mm_cvtsi64_si128(static_cast(division_result)); \ +#define CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl) \ + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) \ + { \ + uint64_t sqrt_result_tmp; \ + assign(sqrt_result_tmp, sqrt_result); \ + /* Use division and square root results from the _previous_ iteration to hide the latency */ \ + const uint64_t cx_64 = _mm_cvtsi128_si64(cx); \ + cl ^= static_cast(_mm_cvtsi128_si64(division_result_xmm)) ^ (sqrt_result_tmp << 32); \ + const uint32_t d = (cx_64 + (sqrt_result_tmp << 1)) | 0x80000001UL; \ + /* Most and least significant bits in the divisor are set to 1 \ + * to make sure we don't divide by a small or even number, \ + * so there are no shortcuts for such cases \ + * \ + * Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4 \ + * We drop the highest bit to fit both quotient and remainder in 32 bits \ + */ \ + /* Compiler will optimize it to a single div instruction */ \ + const uint64_t cx_s = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \ + const uint64_t division_result = static_cast(cx_s / d) + ((cx_s % d) << 32); \ + division_result_xmm = _mm_cvtsi64_si128(static_cast(division_result)); \ /* Use division_result as an input for the square root to prevent parallel implementation in hardware */ \ - assign(sqrt_result, int_sqrt33_1_double_precision(cx_64 + division_result)); \ + assign(sqrt_result, int_sqrt33_1_double_precision(cx_64 + division_result)); \ } -#define CN_R_RANDOM_MATH(n, al, ah, cl, bx0, bx1, cn_r_data) \ - if (ALGO == cryptonight_r || ALGO == cryptonight_r_wow) \ - { \ +#define CN_R_RANDOM_MATH(n, al, ah, cl, bx0, bx1, cn_r_data) \ + if(ALGO == cryptonight_r || ALGO == cryptonight_r_wow) \ + { \ cl ^= (cn_r_data[0] + cn_r_data[1]) | ((uint64_t)(cn_r_data[2] + cn_r_data[3]) << 32); \ - cn_r_data[4] = static_cast(al); \ - cn_r_data[5] = static_cast(ah); \ - cn_r_data[6] = static_cast(_mm_cvtsi128_si32(bx0)); \ - cn_r_data[7] = static_cast(_mm_cvtsi128_si32(bx1)); \ - cn_r_data[8] = static_cast(_mm_cvtsi128_si32(_mm_srli_si128(bx1, 8))); \ - v4_random_math(ctx[n]->cn_r_ctx.code, cn_r_data); \ - } \ - if (ALGO == cryptonight_r) \ - { \ - al ^= cn_r_data[2] | ((uint64_t)(cn_r_data[3]) << 32); \ - ah ^= cn_r_data[0] | ((uint64_t)(cn_r_data[1]) << 32); \ + cn_r_data[4] = static_cast(al); \ + cn_r_data[5] = static_cast(ah); \ + cn_r_data[6] = static_cast(_mm_cvtsi128_si32(bx0)); \ + cn_r_data[7] = static_cast(_mm_cvtsi128_si32(bx1)); \ + cn_r_data[8] = static_cast(_mm_cvtsi128_si32(_mm_srli_si128(bx1, 8))); \ + v4_random_math(ctx[n]->cn_r_ctx.code, cn_r_data); \ + } \ + if(ALGO == cryptonight_r) \ + { \ + al ^= cn_r_data[2] | ((uint64_t)(cn_r_data[3]) << 32); \ + ah ^= cn_r_data[0] | ((uint64_t)(cn_r_data[1]) << 32); \ } -#define CN_INIT_SINGLE \ +#define CN_INIT_SINGLE \ if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) \ - { \ - memset(output, 0, 32 * N); \ - return; \ + { \ + memset(output, 0, 32 * N); \ + return; \ } -#define CN_INIT(n, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm, cn_r_data) \ - keccak((const uint8_t *)input + len * n, len, ctx[n]->hash_state, 200); \ - uint64_t monero_const; \ +#define CN_INIT(n, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm, cn_r_data) \ + keccak((const uint8_t*)input + len * n, len, ctx[n]->hash_state, 200); \ + uint64_t monero_const; \ if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ - { \ - monero_const = *reinterpret_cast(reinterpret_cast(input) + len * n + 35); \ - monero_const ^= *(reinterpret_cast(ctx[n]->hash_state) + 24); \ - } \ - /* Optim - 99% time boundary */ \ - cn_explode_scratchpad((__m128i*)ctx[n]->hash_state, (__m128i*)ctx[n]->long_state, algo); \ - \ - __m128i ax0; \ - uint64_t idx0; \ - __m128i bx0; \ - uint8_t* l0 = ctx[n]->long_state; \ - /* BEGIN cryptonight_monero_v8 variables */ \ - __m128i bx1; \ - __m128i division_result_xmm; \ - __m128 conc_var; \ - if(ALGO == cryptonight_conceal) \ - {\ - set_float_rounding_mode_nearest(); \ - conc_var = _mm_setzero_ps(); \ - }\ - GetOptimalSqrtType_t sqrt_result; \ - uint32_t cn_r_data[9]; \ - /* END cryptonight_monero_v8 variables */ \ - { \ - uint64_t* h0 = (uint64_t*)ctx[n]->hash_state; \ - idx0 = h0[0] ^ h0[4]; \ - ax0 = _mm_set_epi64x(h0[1] ^ h0[5], idx0); \ - bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); \ - if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) \ - { \ - bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \ - division_result_xmm = _mm_cvtsi64_si128(h0[12]); \ - assign(sqrt_result, h0[13]); \ - set_float_rounding_mode(); \ - } \ - if (ALGO == cryptonight_r || ALGO == cryptonight_r_wow) \ - { \ - bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \ - cn_r_data[0] = (uint32_t)(h0[12]); \ - cn_r_data[1] = (uint32_t)(h0[12] >> 32); \ - cn_r_data[2] = (uint32_t)(h0[13]); \ - cn_r_data[3] = (uint32_t)(h0[13] >> 32); \ - } \ - } \ - __m128i *ptr0 + { \ + monero_const = *reinterpret_cast(reinterpret_cast(input) + len * n + 35); \ + monero_const ^= *(reinterpret_cast(ctx[n]->hash_state) + 24); \ + } \ + /* Optim - 99% time boundary */ \ + cn_explode_scratchpad((__m128i*)ctx[n]->hash_state, (__m128i*)ctx[n]->long_state, algo); \ + \ + __m128i ax0; \ + uint64_t idx0; \ + __m128i bx0; \ + uint8_t* l0 = ctx[n]->long_state; \ + /* BEGIN cryptonight_monero_v8 variables */ \ + __m128i bx1; \ + __m128i division_result_xmm; \ + __m128 conc_var; \ + if(ALGO == cryptonight_conceal) \ + { \ + set_float_rounding_mode_nearest(); \ + conc_var = _mm_setzero_ps(); \ + } \ + GetOptimalSqrtType_t sqrt_result; \ + uint32_t cn_r_data[9]; \ + /* END cryptonight_monero_v8 variables */ \ + { \ + uint64_t* h0 = (uint64_t*)ctx[n]->hash_state; \ + idx0 = h0[0] ^ h0[4]; \ + ax0 = _mm_set_epi64x(h0[1] ^ h0[5], idx0); \ + bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); \ + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) \ + { \ + bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \ + division_result_xmm = _mm_cvtsi64_si128(h0[12]); \ + assign(sqrt_result, h0[13]); \ + set_float_rounding_mode(); \ + } \ + if(ALGO == cryptonight_r || ALGO == cryptonight_r_wow) \ + { \ + bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \ + cn_r_data[0] = (uint32_t)(h0[12]); \ + cn_r_data[1] = (uint32_t)(h0[12] >> 32); \ + cn_r_data[2] = (uint32_t)(h0[13]); \ + cn_r_data[3] = (uint32_t)(h0[13] >> 32); \ + } \ + } \ + __m128i* ptr0 #define CN_STEP1(n, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, cx, bx1) \ - __m128i cx; \ - ptr0 = (__m128i *)&l0[idx0 & MASK]; \ - cx = _mm_load_si128(ptr0); \ - if (ALGO == cryptonight_conceal) \ - cryptonight_conceal_tweak(cx, conc_var); \ - if (ALGO == cryptonight_bittube2) \ - { \ - cx = aes_round_bittube2(cx, ax0); \ - } \ - else \ - { \ - if(SOFT_AES) \ - cx = soft_aesenc(cx, ax0); \ - else \ - cx = _mm_aesenc_si128(cx, ax0); \ - } \ + __m128i cx; \ + ptr0 = (__m128i*)&l0[idx0 & MASK]; \ + cx = _mm_load_si128(ptr0); \ + if(ALGO == cryptonight_conceal) \ + cryptonight_conceal_tweak(cx, conc_var); \ + if(ALGO == cryptonight_bittube2) \ + { \ + cx = aes_round_bittube2(cx, ax0); \ + } \ + else \ + { \ + if(SOFT_AES) \ + cx = soft_aesenc(cx, ax0); \ + else \ + cx = _mm_aesenc_si128(cx, ax0); \ + } \ CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1, cx) -#define CN_STEP2(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \ +#define CN_STEP2(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \ if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ - cryptonight_monero_tweak((uint64_t*)ptr0, _mm_xor_si128(bx0, cx)); \ - else \ - _mm_store_si128((__m128i *)ptr0, _mm_xor_si128(bx0, cx)); \ - idx0 = _mm_cvtsi128_si64(cx); \ - \ - ptr0 = (__m128i *)&l0[idx0 & MASK]; \ - if(PREFETCH) \ - _mm_prefetch((const char*)ptr0, _MM_HINT_T0); \ - if(ALGO != cryptonight_monero_v8 && ALGO != cryptonight_r && ALGO != cryptonight_r_wow && ALGO != cryptonight_v8_reversewaltz) \ - bx0 = cx + cryptonight_monero_tweak((uint64_t*)ptr0, _mm_xor_si128(bx0, cx)); \ + else \ + _mm_store_si128((__m128i*)ptr0, _mm_xor_si128(bx0, cx)); \ + idx0 = _mm_cvtsi128_si64(cx); \ + \ + ptr0 = (__m128i*)&l0[idx0 & MASK]; \ + if(PREFETCH) \ + _mm_prefetch((const char*)ptr0, _MM_HINT_T0); \ + if(ALGO != cryptonight_monero_v8 && ALGO != cryptonight_r && ALGO != cryptonight_r_wow && ALGO != cryptonight_v8_reversewaltz) \ + bx0 = cx #define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm, cn_r_data) \ - uint64_t lo, cl, ch; \ - uint64_t al0 = _mm_cvtsi128_si64(ax0); \ - uint64_t ah0 = ((uint64_t*)&ax0)[1]; \ - cl = ((uint64_t*)ptr0)[0]; \ - ch = ((uint64_t*)ptr0)[1]; \ - CN_R_RANDOM_MATH(n, al0, ah0, cl, bx0, bx1, cn_r_data); \ - CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl); \ - { \ - uint64_t hi; \ - lo = _umul128(idx0, cl, &hi); \ - if(ALGO == cryptonight_r) \ - { \ - CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1, cx); \ - } \ - else \ - { \ - CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi); \ - } \ - ah0 += lo; \ - al0 += hi; \ - } \ - if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_r || ALGO == cryptonight_r_wow || ALGO == cryptonight_v8_reversewaltz) \ - { \ - bx1 = bx0; \ - bx0 = cx; \ - } \ - ((uint64_t*)ptr0)[0] = al0; \ - if(PREFETCH) \ - _mm_prefetch((const char*)ptr0, _MM_HINT_T0) - -#define CN_STEP4(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0) \ - if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ - { \ - if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) \ - ((uint64_t*)ptr0)[1] = ah0 ^ monero_const ^ ((uint64_t*)ptr0)[0]; \ - else \ - ((uint64_t*)ptr0)[1] = ah0 ^ monero_const; \ - } \ - else \ - ((uint64_t*)ptr0)[1] = ah0; \ - al0 ^= cl; \ - ah0 ^= ch; \ - ax0 = _mm_set_epi64x(ah0, al0); \ + uint64_t lo, cl, ch; \ + uint64_t al0 = _mm_cvtsi128_si64(ax0); \ + uint64_t ah0 = ((uint64_t*)&ax0)[1]; \ + cl = ((uint64_t*)ptr0)[0]; \ + ch = ((uint64_t*)ptr0)[1]; \ + CN_R_RANDOM_MATH(n, al0, ah0, cl, bx0, bx1, cn_r_data); \ + CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl); \ + { \ + uint64_t hi; \ + lo = _umul128(idx0, cl, &hi); \ + if(ALGO == cryptonight_r) \ + { \ + CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1, cx); \ + } \ + else \ + { \ + CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi); \ + } \ + ah0 += lo; \ + al0 += hi; \ + } \ + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_r || ALGO == cryptonight_r_wow || ALGO == cryptonight_v8_reversewaltz) \ + { \ + bx1 = bx0; \ + bx0 = cx; \ + } \ + ((uint64_t*)ptr0)[0] = al0; \ + if(PREFETCH) \ + _mm_prefetch((const char*)ptr0, _MM_HINT_T0) + +#define CN_STEP4(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0) \ + if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ + { \ + if(ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) \ + ((uint64_t*)ptr0)[1] = ah0 ^ monero_const ^ ((uint64_t*)ptr0)[0]; \ + else \ + ((uint64_t*)ptr0)[1] = ah0 ^ monero_const; \ + } \ + else \ + ((uint64_t*)ptr0)[1] = ah0; \ + al0 ^= cl; \ + ah0 ^= ch; \ + ax0 = _mm_set_epi64x(ah0, al0); \ idx0 = al0; -#define CN_STEP5(n, monero_const, l0, ax0, bx0, idx0, ptr0) \ - if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) \ - { \ - ptr0 = (__m128i *)&l0[idx0 & MASK]; \ - int64_t u = ((int64_t*)ptr0)[0]; \ - int32_t d = ((int32_t*)ptr0)[2]; \ - int64_t q = u / (d | 0x5); \ - \ - ((int64_t*)ptr0)[0] = u ^ q; \ - idx0 = d ^ q; \ - } \ +#define CN_STEP5(n, monero_const, l0, ax0, bx0, idx0, ptr0) \ + if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) \ + { \ + ptr0 = (__m128i*)&l0[idx0 & MASK]; \ + int64_t u = ((int64_t*)ptr0)[0]; \ + int32_t d = ((int32_t*)ptr0)[2]; \ + int64_t q = u / (d | 0x5); \ + \ + ((int64_t*)ptr0)[0] = u ^ q; \ + idx0 = d ^ q; \ + } \ else if(ALGO == cryptonight_haven || ALGO == cryptonight_superfast) \ - { \ - ptr0 = (__m128i *)&l0[idx0 & MASK]; \ - int64_t u = ((int64_t*)ptr0)[0]; \ - int32_t d = ((int32_t*)ptr0)[2]; \ - int64_t q = u / (d | 0x5); \ - \ - ((int64_t*)ptr0)[0] = u ^ q; \ - idx0 = (~d) ^ q; \ + { \ + ptr0 = (__m128i*)&l0[idx0 & MASK]; \ + int64_t u = ((int64_t*)ptr0)[0]; \ + int32_t d = ((int32_t*)ptr0)[2]; \ + int64_t q = u / (d | 0x5); \ + \ + ((int64_t*)ptr0)[0] = u ^ q; \ + idx0 = (~d) ^ q; \ } -#define CN_FINALIZE(n) \ - /* Optim - 90% time boundary */ \ +#define CN_FINALIZE(n) \ + /* Optim - 90% time boundary */ \ cn_implode_scratchpad((__m128i*)ctx[n]->long_state, (__m128i*)ctx[n]->hash_state, algo); \ - /* Optim - 99% time boundary */ \ - keccakf((uint64_t*)ctx[n]->hash_state, 24); \ + /* Optim - 99% time boundary */ \ + keccakf((uint64_t*)ctx[n]->hash_state, 24); \ extra_hashes[ctx[n]->hash_state[0] & 3](ctx[n]->hash_state, 200, (char*)output + 32 * n) //! defer the evaluation of an macro #ifndef _MSC_VER -# define CN_DEFER(...) __VA_ARGS__ +#define CN_DEFER(...) __VA_ARGS__ #else -# define CN_EMPTY(...) -# define CN_DEFER(...) __VA_ARGS__ CN_EMPTY() +#define CN_EMPTY(...) +#define CN_DEFER(...) __VA_ARGS__ CN_EMPTY() #endif //! execute the macro f with the passed arguments -#define CN_EXEC(f,...) CN_DEFER(f)(__VA_ARGS__) +#define CN_EXEC(f, ...) \ + CN_DEFER(f) \ + (__VA_ARGS__) /** add append n to all arguments and keeps n as first argument * @@ -904,22 +908,22 @@ inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var) * @endcode */ #define CN_ENUM_0(n, ...) n -#define CN_ENUM_1(n, x1) n, x1 ## n -#define CN_ENUM_2(n, x1, x2) n, x1 ## n, x2 ## n -#define CN_ENUM_3(n, x1, x2, x3) n, x1 ## n, x2 ## n, x3 ## n -#define CN_ENUM_4(n, x1, x2, x3, x4) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n -#define CN_ENUM_5(n, x1, x2, x3, x4, x5) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n -#define CN_ENUM_6(n, x1, x2, x3, x4, x5, x6) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n -#define CN_ENUM_7(n, x1, x2, x3, x4, x5, x6, x7) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n -#define CN_ENUM_8(n, x1, x2, x3, x4, x5, x6, x7, x8) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n -#define CN_ENUM_9(n, x1, x2, x3, x4, x5, x6, x7, x8, x9) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n -#define CN_ENUM_10(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n -#define CN_ENUM_11(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n -#define CN_ENUM_12(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n -#define CN_ENUM_13(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n -#define CN_ENUM_14(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n -#define CN_ENUM_15(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n, x15 ## n -#define CN_ENUM_16(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n, x15 ## n, x16 ## n +#define CN_ENUM_1(n, x1) n, x1##n +#define CN_ENUM_2(n, x1, x2) n, x1##n, x2##n +#define CN_ENUM_3(n, x1, x2, x3) n, x1##n, x2##n, x3##n +#define CN_ENUM_4(n, x1, x2, x3, x4) n, x1##n, x2##n, x3##n, x4##n +#define CN_ENUM_5(n, x1, x2, x3, x4, x5) n, x1##n, x2##n, x3##n, x4##n, x5##n +#define CN_ENUM_6(n, x1, x2, x3, x4, x5, x6) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n +#define CN_ENUM_7(n, x1, x2, x3, x4, x5, x6, x7) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n +#define CN_ENUM_8(n, x1, x2, x3, x4, x5, x6, x7, x8) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n +#define CN_ENUM_9(n, x1, x2, x3, x4, x5, x6, x7, x8, x9) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n +#define CN_ENUM_10(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n, x10##n +#define CN_ENUM_11(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n, x10##n, x11##n +#define CN_ENUM_12(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n, x10##n, x11##n, x12##n +#define CN_ENUM_13(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n, x10##n, x11##n, x12##n, x13##n +#define CN_ENUM_14(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n, x10##n, x11##n, x12##n, x13##n, x14##n +#define CN_ENUM_15(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n, x10##n, x11##n, x12##n, x13##n, x14##n, x15##n +#define CN_ENUM_16(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16) n, x1##n, x2##n, x3##n, x4##n, x5##n, x6##n, x7##n, x8##n, x9##n, x10##n, x11##n, x12##n, x13##n, x14##n, x15##n, x16##n /** repeat a macro call multiple times * @@ -933,21 +937,35 @@ inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var) * f(0, foo0, bar); f(1, foo1, bar1) * @endcode */ -#define REPEAT_1(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)) -#define REPEAT_2(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)) -#define REPEAT_3(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__)) -#define REPEAT_4(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(3, __VA_ARGS__)) -#define REPEAT_5(n, f, ...) CN_EXEC(f, CN_ENUM_ ## n(0, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(1, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(2, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(3, __VA_ARGS__)); CN_EXEC(f, CN_ENUM_ ## n(4, __VA_ARGS__)) - -template< size_t N> +#define REPEAT_1(n, f, ...) CN_EXEC(f, CN_ENUM_##n(0, __VA_ARGS__)) +#define REPEAT_2(n, f, ...) \ + CN_EXEC(f, CN_ENUM_##n(0, __VA_ARGS__)); \ + CN_EXEC(f, CN_ENUM_##n(1, __VA_ARGS__)) +#define REPEAT_3(n, f, ...) \ + CN_EXEC(f, CN_ENUM_##n(0, __VA_ARGS__)); \ + CN_EXEC(f, CN_ENUM_##n(1, __VA_ARGS__)); \ + CN_EXEC(f, CN_ENUM_##n(2, __VA_ARGS__)) +#define REPEAT_4(n, f, ...) \ + CN_EXEC(f, CN_ENUM_##n(0, __VA_ARGS__)); \ + CN_EXEC(f, CN_ENUM_##n(1, __VA_ARGS__)); \ + CN_EXEC(f, CN_ENUM_##n(2, __VA_ARGS__)); \ + CN_EXEC(f, CN_ENUM_##n(3, __VA_ARGS__)) +#define REPEAT_5(n, f, ...) \ + CN_EXEC(f, CN_ENUM_##n(0, __VA_ARGS__)); \ + CN_EXEC(f, CN_ENUM_##n(1, __VA_ARGS__)); \ + CN_EXEC(f, CN_ENUM_##n(2, __VA_ARGS__)); \ + CN_EXEC(f, CN_ENUM_##n(3, __VA_ARGS__)); \ + CN_EXEC(f, CN_ENUM_##n(4, __VA_ARGS__)) + +template struct Cryptonight_hash; -template< > +template <> struct Cryptonight_hash<1> { static constexpr size_t N = 1; - template + template static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) { const uint32_t MASK = algo.Mask(); @@ -971,12 +989,12 @@ struct Cryptonight_hash<1> } }; -template< > +template <> struct Cryptonight_hash<2> { static constexpr size_t N = 2; - template + template static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) { const uint32_t MASK = algo.Mask(); @@ -1000,12 +1018,12 @@ struct Cryptonight_hash<2> } }; -template< > +template <> struct Cryptonight_hash<3> { static constexpr size_t N = 3; - template + template static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) { const uint32_t MASK = algo.Mask(); @@ -1029,12 +1047,12 @@ struct Cryptonight_hash<3> } }; -template< > +template <> struct Cryptonight_hash<4> { static constexpr size_t N = 4; - template + template static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) { const uint32_t MASK = algo.Mask(); @@ -1058,12 +1076,12 @@ struct Cryptonight_hash<4> } }; -template< > +template <> struct Cryptonight_hash<5> { static constexpr size_t N = 5; - template + template static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) { const uint32_t MASK = algo.Mask(); @@ -1087,26 +1105,25 @@ struct Cryptonight_hash<5> } }; -extern "C" void cryptonight_v8_mainloop_ivybridge_asm(cryptonight_ctx* ctx0); -extern "C" void cryptonight_v8_mainloop_ryzen_asm(cryptonight_ctx* ctx0); +extern "C" void cryptonight_v8_mainloop_ivybridge_asm(cryptonight_ctx* ctx0); +extern "C" void cryptonight_v8_mainloop_ryzen_asm(cryptonight_ctx* ctx0); extern "C" void cryptonight_v8_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1); - -template< size_t N, size_t asm_version> +template struct Cryptonight_hash_asm { - template + template static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) { for(size_t i = 0; i < N; ++i) { - keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200); + keccak((const uint8_t*)input + len * i, len, ctx[i]->hash_state, 200); cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state, algo); } if(ALGO == cryptonight_r) { // API ATTRIBUTE is only required for cryptonight_r - typedef void ABI_ATTRIBUTE (*cn_r_mainloop_fun)(cryptonight_ctx *ctx); + typedef void ABI_ATTRIBUTE (*cn_r_mainloop_fun)(cryptonight_ctx * ctx); for(size_t i = 0; i < N; ++i) reinterpret_cast(ctx[0]->loop_fn)(ctx[i]); // use always loop_fn from ctx[0]!! } @@ -1126,19 +1143,19 @@ struct Cryptonight_hash_asm }; // double hash with specialized asm only for intel -template< > +template <> struct Cryptonight_hash_asm<2, 0> { static constexpr size_t N = 2; - template + template static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) { const size_t MEM = algo.Mem(); for(size_t i = 0; i < N; ++i) { - keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200); + keccak((const uint8_t*)input + len * i, len, ctx[i]->hash_state, 200); /* Optim - 99% time boundary */ cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state, algo); } @@ -1167,89 +1184,90 @@ struct Cryptonight_hash_asm<2, 0> namespace { -template +template static void patchCode(T dst, U src, const uint32_t iterations, const uint32_t mask) { - const uint8_t* p = reinterpret_cast(src); - - // Workaround for Visual Studio placing trampoline in debug builds. -# if defined(_MSC_VER) - if (p[0] == 0xE9) { - p += *(int32_t*)(p + 1) + 5; - } -# endif - - size_t size = 0; - while (*(uint32_t*)(p + size) != 0xDEADC0DE) { - ++size; - } - size += sizeof(uint32_t); - - memcpy((void*) dst, (const void*) src, size); - - uint8_t* patched_data = reinterpret_cast(dst); - for (size_t i = 0; i + sizeof(uint32_t) <= size; ++i) { - switch (*(uint32_t*)(patched_data + i)) { - case CN_ITER: - *(uint32_t*)(patched_data + i) = iterations; - break; - - case CN_MASK: - *(uint32_t*)(patched_data + i) = mask; - break; - } - } -} + const uint8_t* p = reinterpret_cast(src); + + // Workaround for Visual Studio placing trampoline in debug builds. +#if defined(_MSC_VER) + if(p[0] == 0xE9) + { + p += *(int32_t*)(p + 1) + 5; + } +#endif + + size_t size = 0; + while(*(uint32_t*)(p + size) != 0xDEADC0DE) + { + ++size; + } + size += sizeof(uint32_t); + + memcpy((void*)dst, (const void*)src, size); + + uint8_t* patched_data = reinterpret_cast(dst); + for(size_t i = 0; i + sizeof(uint32_t) <= size; ++i) + { + switch(*(uint32_t*)(patched_data + i)) + { + case CN_ITER: + *(uint32_t*)(patched_data + i) = iterations; + break; + case CN_MASK: + *(uint32_t*)(patched_data + i) = mask; + break; + } + } +} void* allocateExecutableMemory(size_t size) { #ifdef _WIN64 -return VirtualAlloc(0, size, MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE); + return VirtualAlloc(0, size, MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE); +#else +#if defined(__APPLE__) + return mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANON, -1, 0); #else -# if defined(__APPLE__) - return mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANON, -1, 0); -# else - return mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); -# endif + return mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); +#endif #endif } - -void protectExecutableMemory(void *p, size_t size) +void protectExecutableMemory(void* p, size_t size) { #ifdef _WIN64 - DWORD oldProtect; - VirtualProtect(p, size, PAGE_EXECUTE_READ, &oldProtect); + DWORD oldProtect; + VirtualProtect(p, size, PAGE_EXECUTE_READ, &oldProtect); #else - mprotect(p, size, PROT_READ | PROT_EXEC); + mprotect(p, size, PROT_READ | PROT_EXEC); #endif } -void unprotectExecutableMemory(void *p, size_t size) +void unprotectExecutableMemory(void* p, size_t size) { #ifdef _WIN64 - DWORD oldProtect; - VirtualProtect(p, size, PAGE_EXECUTE_READWRITE, &oldProtect); + DWORD oldProtect; + VirtualProtect(p, size, PAGE_EXECUTE_READWRITE, &oldProtect); #else - mprotect(p, size, PROT_WRITE | PROT_EXEC); + mprotect(p, size, PROT_WRITE | PROT_EXEC); #endif } - -void flushInstructionCache(void *p, size_t size) +void flushInstructionCache(void* p, size_t size) { #ifdef _WIN64 - ::FlushInstructionCache(GetCurrentProcess(), p, size); + ::FlushInstructionCache(GetCurrentProcess(), p, size); #else -# ifndef __FreeBSD__ - __builtin___clear_cache(reinterpret_cast(p), reinterpret_cast(p) + size); -# endif +#ifndef __FreeBSD__ + __builtin___clear_cache(reinterpret_cast(p), reinterpret_cast(p) + size); +#endif #endif } -template +template void patchAsmVariants(std::string selected_asm, cryptonight_ctx** ctx, const xmrstak_algo& algo) { const uint32_t Iter = algo.Iter(); @@ -1270,7 +1288,8 @@ void patchAsmVariants(std::string selected_asm, cryptonight_ctx** ctx, const xmr if(N == 2) src_code = reinterpret_cast(cryptonight_v8_double_mainloop_sandybridge_asm); else - src_code = cryptonight_v8_mainloop_ivybridge_asm;; + src_code = cryptonight_v8_mainloop_ivybridge_asm; + ; } // supports only 1 thread per hash if(selected_asm == "amd_avx") @@ -1295,19 +1314,17 @@ void patchAsmVariants(std::string selected_asm, cryptonight_ctx** ctx, const xmr flushInstructionCache(ctx[0]->fun_data, allocation_size); } } -} // namespace (anonymous) - - +} // namespace struct Cryptonight_hash_gpu { static constexpr size_t N = 1; - template + template static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) { set_float_rounding_mode_nearest(); - keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200); + keccak((const uint8_t*)input, len, ctx[0]->hash_state, 200); cn_explode_scratchpad_gpu(ctx[0]->hash_state, ctx[0]->long_state, algo); if(cngpu_check_avx2()) @@ -1321,16 +1338,15 @@ struct Cryptonight_hash_gpu } }; -template +template struct Cryptonight_R_generator { - template + template static void cn_on_new_job(const xmrstak::miner_work& work, cryptonight_ctx** ctx) { if(ctx[0]->cn_r_ctx.height == work.iBlockHeight && ctx[0]->last_algo == POW(cryptonight_r) && - reinterpret_cast(ctx[0]->hash_fn) == ctx[0]->fun_data - ) + reinterpret_cast(ctx[0]->hash_fn) == ctx[0]->fun_data) return; ctx[0]->last_algo = POW(cryptonight_r); @@ -1346,7 +1362,7 @@ struct Cryptonight_R_generator ctx[0]->hash_fn = Cryptonight_hash_asm::template hash; } - for(size_t i=1; i < N; i++) + for(size_t i = 1; i < N; i++) { ctx[i]->cn_r_ctx = ctx[0]->cn_r_ctx; ctx[i]->loop_fn = ctx[0]->loop_fn; diff --git a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp index a9d1c96fd..e35c7c7b8 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp +++ b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp @@ -23,19 +23,19 @@ extern "C" { -#include "c_groestl.h" #include "c_blake256.h" +#include "c_groestl.h" #include "c_jh.h" #include "c_skein.h" } -#include "xmrstak/backend/cryptonight.hpp" #include "cryptonight.h" #include "cryptonight_aesni.h" -#include "xmrstak/misc/console.hpp" +#include "xmrstak/backend/cryptonight.hpp" #include "xmrstak/jconf.hpp" +#include "xmrstak/misc/console.hpp" +#include #include #include -#include #ifdef __GNUC__ #include @@ -49,30 +49,35 @@ extern "C" #ifdef _WIN32 #include +// this comment avoid that clang format reorders the includes #include #else -#include #include #include +#include #endif // _WIN32 -void do_blake_hash(const void* input, uint32_t len, char* output) { +void do_blake_hash(const void* input, uint32_t len, char* output) +{ blake256_hash((uint8_t*)output, (const uint8_t*)input, len); } -void do_groestl_hash(const void* input, uint32_t len, char* output) { +void do_groestl_hash(const void* input, uint32_t len, char* output) +{ groestl((const uint8_t*)input, len * 8, (uint8_t*)output); } -void do_jh_hash(const void* input, uint32_t len, char* output) { +void do_jh_hash(const void* input, uint32_t len, char* output) +{ jh_hash(32 * 8, (const uint8_t*)input, 8 * len, (uint8_t*)output); } -void do_skein_hash(const void* input, uint32_t len, char* output) { +void do_skein_hash(const void* input, uint32_t len, char* output) +{ skein_hash(8 * 32, (const uint8_t*)input, 8 * len, (uint8_t*)output); } -void (* const extra_hashes[4])(const void *, uint32_t, char *) = {do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash}; +void (*const extra_hashes[4])(const void*, uint32_t, char*) = {do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash}; #ifdef _WIN32 #include "xmrstak/misc/uac.hpp" @@ -81,21 +86,21 @@ BOOL bRebootDesirable = FALSE; //If VirtualAlloc fails, suggest a reboot BOOL AddPrivilege(TCHAR* pszPrivilege) { - HANDLE hToken; + HANDLE hToken; TOKEN_PRIVILEGES tp; - BOOL status; + BOOL status; - if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hToken)) + if(!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hToken)) return FALSE; - if (!LookupPrivilegeValue(NULL, pszPrivilege, &tp.Privileges[0].Luid)) + if(!LookupPrivilegeValue(NULL, pszPrivilege, &tp.Privileges[0].Luid)) return FALSE; tp.PrivilegeCount = 1; tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; status = AdjustTokenPrivileges(hToken, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0); - if (!status || (GetLastError() != ERROR_SUCCESS)) + if(!status || (GetLastError() != ERROR_SUCCESS)) return FALSE; CloseHandle(hToken); @@ -107,19 +112,19 @@ BOOL AddLargePageRights() HANDLE hToken; PTOKEN_USER user = NULL; - if (OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY, &hToken) == TRUE) + if(OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY, &hToken) == TRUE) { TOKEN_ELEVATION Elevation; DWORD cbSize = sizeof(TOKEN_ELEVATION); BOOL bIsElevated = FALSE; - if (GetTokenInformation(hToken, TokenElevation, &Elevation, sizeof(Elevation), &cbSize)) + if(GetTokenInformation(hToken, TokenElevation, &Elevation, sizeof(Elevation), &cbSize)) bIsElevated = Elevation.TokenIsElevated; DWORD size = 0; GetTokenInformation(hToken, TokenUser, NULL, 0, &size); - if (size > 0 && bIsElevated) + if(size > 0 && bIsElevated) { user = (PTOKEN_USER)LocalAlloc(LPTR, size); GetTokenInformation(hToken, TokenUser, user, size, &size); @@ -128,7 +133,7 @@ BOOL AddLargePageRights() CloseHandle(hToken); } - if (!user) + if(!user) return FALSE; LSA_HANDLE handle; @@ -136,7 +141,7 @@ BOOL AddLargePageRights() ZeroMemory(&attributes, sizeof(attributes)); BOOL result = FALSE; - if (LsaOpenPolicy(NULL, &attributes, POLICY_ALL_ACCESS, &handle) == 0) + if(LsaOpenPolicy(NULL, &attributes, POLICY_ALL_ACCESS, &handle) == 0) { LSA_UNICODE_STRING lockmem; lockmem.Buffer = L"SeLockMemoryPrivilege"; @@ -146,11 +151,11 @@ BOOL AddLargePageRights() PLSA_UNICODE_STRING rights = NULL; ULONG cnt = 0; BOOL bHasRights = FALSE; - if (LsaEnumerateAccountRights(handle, user->User.Sid, &rights, &cnt) == 0) + if(LsaEnumerateAccountRights(handle, user->User.Sid, &rights, &cnt) == 0) { - for (size_t i = 0; i < cnt; i++) + for(size_t i = 0; i < cnt; i++) { - if (rights[i].Length == lockmem.Length && + if(rights[i].Length == lockmem.Length && memcmp(rights[i].Buffer, lockmem.Buffer, 42) == 0) { bHasRights = TRUE; @@ -220,7 +225,7 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al ptr->ctx_info[0] = 0; ptr->ctx_info[1] = 0; if(ptr->long_state == NULL) - printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: _mm_malloc was not able to allocate %s byte",std::to_string(hashMemSize).c_str()); + printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: _mm_malloc was not able to allocate %s byte", std::to_string(hashMemSize).c_str()); return ptr; } @@ -250,7 +255,7 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al #else //http://man7.org/linux/man-pages/man2/mmap.2.html #if defined(__APPLE__) - ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE, + ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0); #elif defined(__FreeBSD__) ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE, @@ -261,7 +266,7 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al #else ptr->long_state = (uint8_t*)mmap(NULL, hashMemSize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, -1, 0); - if (ptr->long_state == MAP_FAILED) + if(ptr->long_state == MAP_FAILED) { // try without MAP_HUGETLB for crappy kernels msg->warning = "mmap with HUGETLB failed, attempting without it (you should fix your kernel)"; @@ -270,7 +275,7 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al } #endif - if (ptr->long_state == MAP_FAILED) + if(ptr->long_state == MAP_FAILED) { _mm_free(ptr); msg->warning = "mmap failed, check attribute 'use_slow_memory' in 'config.txt'"; @@ -279,7 +284,7 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al ptr->ctx_info[0] = 1; - if(madvise(ptr->long_state, hashMemSize, MADV_RANDOM|MADV_WILLNEED) != 0) + if(madvise(ptr->long_state, hashMemSize, MADV_RANDOM | MADV_WILLNEED) != 0) msg->warning = "madvise failed"; ptr->ctx_info[1] = 0; diff --git a/xmrstak/backend/cpu/crypto/groestl_tables.h b/xmrstak/backend/cpu/crypto/groestl_tables.h index a23295c35..85dd25f3d 100644 --- a/xmrstak/backend/cpu/crypto/groestl_tables.h +++ b/xmrstak/backend/cpu/crypto/groestl_tables.h @@ -1,38 +1,6 @@ #ifndef __tables_h #define __tables_h - -const uint32_t T[512] = {0xa5f432c6, 0xc6a597f4, 0x84976ff8, 0xf884eb97, 0x99b05eee, 0xee99c7b0, 0x8d8c7af6, 0xf68df78c, 0xd17e8ff, 0xff0de517, 0xbddc0ad6, 0xd6bdb7dc, 0xb1c816de, 0xdeb1a7c8, 0x54fc6d91, 0x915439fc -, 0x50f09060, 0x6050c0f0, 0x3050702, 0x2030405, 0xa9e02ece, 0xcea987e0, 0x7d87d156, 0x567dac87, 0x192bcce7, 0xe719d52b, 0x62a613b5, 0xb56271a6, 0xe6317c4d, 0x4de69a31, 0x9ab559ec, 0xec9ac3b5 -, 0x45cf408f, 0x8f4505cf, 0x9dbca31f, 0x1f9d3ebc, 0x40c04989, 0x894009c0, 0x879268fa, 0xfa87ef92, 0x153fd0ef, 0xef15c53f, 0xeb2694b2, 0xb2eb7f26, 0xc940ce8e, 0x8ec90740, 0xb1de6fb, 0xfb0bed1d -, 0xec2f6e41, 0x41ec822f, 0x67a91ab3, 0xb3677da9, 0xfd1c435f, 0x5ffdbe1c, 0xea256045, 0x45ea8a25, 0xbfdaf923, 0x23bf46da, 0xf7025153, 0x53f7a602, 0x96a145e4, 0xe496d3a1, 0x5bed769b, 0x9b5b2ded -, 0xc25d2875, 0x75c2ea5d, 0x1c24c5e1, 0xe11cd924, 0xaee9d43d, 0x3dae7ae9, 0x6abef24c, 0x4c6a98be, 0x5aee826c, 0x6c5ad8ee, 0x41c3bd7e, 0x7e41fcc3, 0x206f3f5, 0xf502f106, 0x4fd15283, 0x834f1dd1 -, 0x5ce48c68, 0x685cd0e4, 0xf4075651, 0x51f4a207, 0x345c8dd1, 0xd134b95c, 0x818e1f9, 0xf908e918, 0x93ae4ce2, 0xe293dfae, 0x73953eab, 0xab734d95, 0x53f59762, 0x6253c4f5, 0x3f416b2a, 0x2a3f5441 -, 0xc141c08, 0x80c1014, 0x52f66395, 0x955231f6, 0x65afe946, 0x46658caf, 0x5ee27f9d, 0x9d5e21e2, 0x28784830, 0x30286078, 0xa1f8cf37, 0x37a16ef8, 0xf111b0a, 0xa0f1411, 0xb5c4eb2f, 0x2fb55ec4 -, 0x91b150e, 0xe091c1b, 0x365a7e24, 0x2436485a, 0x9bb6ad1b, 0x1b9b36b6, 0x3d4798df, 0xdf3da547, 0x266aa7cd, 0xcd26816a, 0x69bbf54e, 0x4e699cbb, 0xcd4c337f, 0x7fcdfe4c, 0x9fba50ea, 0xea9fcfba -, 0x1b2d3f12, 0x121b242d, 0x9eb9a41d, 0x1d9e3ab9, 0x749cc458, 0x5874b09c, 0x2e724634, 0x342e6872, 0x2d774136, 0x362d6c77, 0xb2cd11dc, 0xdcb2a3cd, 0xee299db4, 0xb4ee7329, 0xfb164d5b, 0x5bfbb616 -, 0xf601a5a4, 0xa4f65301, 0x4dd7a176, 0x764decd7, 0x61a314b7, 0xb76175a3, 0xce49347d, 0x7dcefa49, 0x7b8ddf52, 0x527ba48d, 0x3e429fdd, 0xdd3ea142, 0x7193cd5e, 0x5e71bc93, 0x97a2b113, 0x139726a2 -, 0xf504a2a6, 0xa6f55704, 0x68b801b9, 0xb96869b8, 0x0, 0x0, 0x2c74b5c1, 0xc12c9974, 0x60a0e040, 0x406080a0, 0x1f21c2e3, 0xe31fdd21, 0xc8433a79, 0x79c8f243, 0xed2c9ab6, 0xb6ed772c -, 0xbed90dd4, 0xd4beb3d9, 0x46ca478d, 0x8d4601ca, 0xd9701767, 0x67d9ce70, 0x4bddaf72, 0x724be4dd, 0xde79ed94, 0x94de3379, 0xd467ff98, 0x98d42b67, 0xe82393b0, 0xb0e87b23, 0x4ade5b85, 0x854a11de -, 0x6bbd06bb, 0xbb6b6dbd, 0x2a7ebbc5, 0xc52a917e, 0xe5347b4f, 0x4fe59e34, 0x163ad7ed, 0xed16c13a, 0xc554d286, 0x86c51754, 0xd762f89a, 0x9ad72f62, 0x55ff9966, 0x6655ccff, 0x94a7b611, 0x119422a7 -, 0xcf4ac08a, 0x8acf0f4a, 0x1030d9e9, 0xe910c930, 0x60a0e04, 0x406080a, 0x819866fe, 0xfe81e798, 0xf00baba0, 0xa0f05b0b, 0x44ccb478, 0x7844f0cc, 0xbad5f025, 0x25ba4ad5, 0xe33e754b, 0x4be3963e -, 0xf30eaca2, 0xa2f35f0e, 0xfe19445d, 0x5dfeba19, 0xc05bdb80, 0x80c01b5b, 0x8a858005, 0x58a0a85, 0xadecd33f, 0x3fad7eec, 0xbcdffe21, 0x21bc42df, 0x48d8a870, 0x7048e0d8, 0x40cfdf1, 0xf104f90c -, 0xdf7a1963, 0x63dfc67a, 0xc1582f77, 0x77c1ee58, 0x759f30af, 0xaf75459f, 0x63a5e742, 0x426384a5, 0x30507020, 0x20304050, 0x1a2ecbe5, 0xe51ad12e, 0xe12effd, 0xfd0ee112, 0x6db708bf, 0xbf6d65b7 -, 0x4cd45581, 0x814c19d4, 0x143c2418, 0x1814303c, 0x355f7926, 0x26354c5f, 0x2f71b2c3, 0xc32f9d71, 0xe13886be, 0xbee16738, 0xa2fdc835, 0x35a26afd, 0xcc4fc788, 0x88cc0b4f, 0x394b652e, 0x2e395c4b -, 0x57f96a93, 0x93573df9, 0xf20d5855, 0x55f2aa0d, 0x829d61fc, 0xfc82e39d, 0x47c9b37a, 0x7a47f4c9, 0xacef27c8, 0xc8ac8bef, 0xe73288ba, 0xbae76f32, 0x2b7d4f32, 0x322b647d, 0x95a442e6, 0xe695d7a4 -, 0xa0fb3bc0, 0xc0a09bfb, 0x98b3aa19, 0x199832b3, 0xd168f69e, 0x9ed12768, 0x7f8122a3, 0xa37f5d81, 0x66aaee44, 0x446688aa, 0x7e82d654, 0x547ea882, 0xabe6dd3b, 0x3bab76e6, 0x839e950b, 0xb83169e -, 0xca45c98c, 0x8cca0345, 0x297bbcc7, 0xc729957b, 0xd36e056b, 0x6bd3d66e, 0x3c446c28, 0x283c5044, 0x798b2ca7, 0xa779558b, 0xe23d81bc, 0xbce2633d, 0x1d273116, 0x161d2c27, 0x769a37ad, 0xad76419a -, 0x3b4d96db, 0xdb3bad4d, 0x56fa9e64, 0x6456c8fa, 0x4ed2a674, 0x744ee8d2, 0x1e223614, 0x141e2822, 0xdb76e492, 0x92db3f76, 0xa1e120c, 0xc0a181e, 0x6cb4fc48, 0x486c90b4, 0xe4378fb8, 0xb8e46b37 -, 0x5de7789f, 0x9f5d25e7, 0x6eb20fbd, 0xbd6e61b2, 0xef2a6943, 0x43ef862a, 0xa6f135c4, 0xc4a693f1, 0xa8e3da39, 0x39a872e3, 0xa4f7c631, 0x31a462f7, 0x37598ad3, 0xd337bd59, 0x8b8674f2, 0xf28bff86 -, 0x325683d5, 0xd532b156, 0x43c54e8b, 0x8b430dc5, 0x59eb856e, 0x6e59dceb, 0xb7c218da, 0xdab7afc2, 0x8c8f8e01, 0x18c028f, 0x64ac1db1, 0xb16479ac, 0xd26df19c, 0x9cd2236d, 0xe03b7249, 0x49e0923b -, 0xb4c71fd8, 0xd8b4abc7, 0xfa15b9ac, 0xacfa4315, 0x709faf3, 0xf307fd09, 0x256fa0cf, 0xcf25856f, 0xafea20ca, 0xcaaf8fea, 0x8e897df4, 0xf48ef389, 0xe9206747, 0x47e98e20, 0x18283810, 0x10182028 -, 0xd5640b6f, 0x6fd5de64, 0x888373f0, 0xf088fb83, 0x6fb1fb4a, 0x4a6f94b1, 0x7296ca5c, 0x5c72b896, 0x246c5438, 0x3824706c, 0xf1085f57, 0x57f1ae08, 0xc7522173, 0x73c7e652, 0x51f36497, 0x975135f3 -, 0x2365aecb, 0xcb238d65, 0x7c8425a1, 0xa17c5984, 0x9cbf57e8, 0xe89ccbbf, 0x21635d3e, 0x3e217c63, 0xdd7cea96, 0x96dd377c, 0xdc7f1e61, 0x61dcc27f, 0x86919c0d, 0xd861a91, 0x85949b0f, 0xf851e94 -, 0x90ab4be0, 0xe090dbab, 0x42c6ba7c, 0x7c42f8c6, 0xc4572671, 0x71c4e257, 0xaae529cc, 0xccaa83e5, 0xd873e390, 0x90d83b73, 0x50f0906, 0x6050c0f, 0x103f4f7, 0xf701f503, 0x12362a1c, 0x1c123836 -, 0xa3fe3cc2, 0xc2a39ffe, 0x5fe18b6a, 0x6a5fd4e1, 0xf910beae, 0xaef94710, 0xd06b0269, 0x69d0d26b, 0x91a8bf17, 0x17912ea8, 0x58e87199, 0x995829e8, 0x2769533a, 0x3a277469, 0xb9d0f727, 0x27b94ed0 -, 0x384891d9, 0xd938a948, 0x1335deeb, 0xeb13cd35, 0xb3cee52b, 0x2bb356ce, 0x33557722, 0x22334455, 0xbbd604d2, 0xd2bbbfd6, 0x709039a9, 0xa9704990, 0x89808707, 0x7890e80, 0xa7f2c133, 0x33a766f2 -, 0xb6c1ec2d, 0x2db65ac1, 0x22665a3c, 0x3c227866, 0x92adb815, 0x15922aad, 0x2060a9c9, 0xc9208960, 0x49db5c87, 0x874915db, 0xff1ab0aa, 0xaaff4f1a, 0x7888d850, 0x5078a088, 0x7a8e2ba5, 0xa57a518e -, 0x8f8a8903, 0x38f068a, 0xf8134a59, 0x59f8b213, 0x809b9209, 0x980129b, 0x1739231a, 0x1a173439, 0xda751065, 0x65daca75, 0x315384d7, 0xd731b553, 0xc651d584, 0x84c61351, 0xb8d303d0, 0xd0b8bbd3 -, 0xc35edc82, 0x82c31f5e, 0xb0cbe229, 0x29b052cb, 0x7799c35a, 0x5a77b499, 0x11332d1e, 0x1e113c33, 0xcb463d7b, 0x7bcbf646, 0xfc1fb7a8, 0xa8fc4b1f, 0xd6610c6d, 0x6dd6da61, 0x3a4e622c, 0x2c3a584e}; +const uint32_t T[512] = {0xa5f432c6, 0xc6a597f4, 0x84976ff8, 0xf884eb97, 0x99b05eee, 0xee99c7b0, 0x8d8c7af6, 0xf68df78c, 0xd17e8ff, 0xff0de517, 0xbddc0ad6, 0xd6bdb7dc, 0xb1c816de, 0xdeb1a7c8, 0x54fc6d91, 0x915439fc, 0x50f09060, 0x6050c0f0, 0x3050702, 0x2030405, 0xa9e02ece, 0xcea987e0, 0x7d87d156, 0x567dac87, 0x192bcce7, 0xe719d52b, 0x62a613b5, 0xb56271a6, 0xe6317c4d, 0x4de69a31, 0x9ab559ec, 0xec9ac3b5, 0x45cf408f, 0x8f4505cf, 0x9dbca31f, 0x1f9d3ebc, 0x40c04989, 0x894009c0, 0x879268fa, 0xfa87ef92, 0x153fd0ef, 0xef15c53f, 0xeb2694b2, 0xb2eb7f26, 0xc940ce8e, 0x8ec90740, 0xb1de6fb, 0xfb0bed1d, 0xec2f6e41, 0x41ec822f, 0x67a91ab3, 0xb3677da9, 0xfd1c435f, 0x5ffdbe1c, 0xea256045, 0x45ea8a25, 0xbfdaf923, 0x23bf46da, 0xf7025153, 0x53f7a602, 0x96a145e4, 0xe496d3a1, 0x5bed769b, 0x9b5b2ded, 0xc25d2875, 0x75c2ea5d, 0x1c24c5e1, 0xe11cd924, 0xaee9d43d, 0x3dae7ae9, 0x6abef24c, 0x4c6a98be, 0x5aee826c, 0x6c5ad8ee, 0x41c3bd7e, 0x7e41fcc3, 0x206f3f5, 0xf502f106, 0x4fd15283, 0x834f1dd1, 0x5ce48c68, 0x685cd0e4, 0xf4075651, 0x51f4a207, 0x345c8dd1, 0xd134b95c, 0x818e1f9, 0xf908e918, 0x93ae4ce2, 0xe293dfae, 0x73953eab, 0xab734d95, 0x53f59762, 0x6253c4f5, 0x3f416b2a, 0x2a3f5441, 0xc141c08, 0x80c1014, 0x52f66395, 0x955231f6, 0x65afe946, 0x46658caf, 0x5ee27f9d, 0x9d5e21e2, 0x28784830, 0x30286078, 0xa1f8cf37, 0x37a16ef8, 0xf111b0a, 0xa0f1411, 0xb5c4eb2f, 0x2fb55ec4, 0x91b150e, 0xe091c1b, 0x365a7e24, 0x2436485a, 0x9bb6ad1b, 0x1b9b36b6, 0x3d4798df, 0xdf3da547, 0x266aa7cd, 0xcd26816a, 0x69bbf54e, 0x4e699cbb, 0xcd4c337f, 0x7fcdfe4c, 0x9fba50ea, 0xea9fcfba, 0x1b2d3f12, 0x121b242d, 0x9eb9a41d, 0x1d9e3ab9, 0x749cc458, 0x5874b09c, 0x2e724634, 0x342e6872, 0x2d774136, 0x362d6c77, 0xb2cd11dc, 0xdcb2a3cd, 0xee299db4, 0xb4ee7329, 0xfb164d5b, 0x5bfbb616, 0xf601a5a4, 0xa4f65301, 0x4dd7a176, 0x764decd7, 0x61a314b7, 0xb76175a3, 0xce49347d, 0x7dcefa49, 0x7b8ddf52, 0x527ba48d, 0x3e429fdd, 0xdd3ea142, 0x7193cd5e, 0x5e71bc93, 0x97a2b113, 0x139726a2, 0xf504a2a6, 0xa6f55704, 0x68b801b9, 0xb96869b8, 0x0, 0x0, 0x2c74b5c1, 0xc12c9974, 0x60a0e040, 0x406080a0, 0x1f21c2e3, 0xe31fdd21, 0xc8433a79, 0x79c8f243, 0xed2c9ab6, 0xb6ed772c, 0xbed90dd4, 0xd4beb3d9, 0x46ca478d, 0x8d4601ca, 0xd9701767, 0x67d9ce70, 0x4bddaf72, 0x724be4dd, 0xde79ed94, 0x94de3379, 0xd467ff98, 0x98d42b67, 0xe82393b0, 0xb0e87b23, 0x4ade5b85, 0x854a11de, 0x6bbd06bb, 0xbb6b6dbd, 0x2a7ebbc5, 0xc52a917e, 0xe5347b4f, 0x4fe59e34, 0x163ad7ed, 0xed16c13a, 0xc554d286, 0x86c51754, 0xd762f89a, 0x9ad72f62, 0x55ff9966, 0x6655ccff, 0x94a7b611, 0x119422a7, 0xcf4ac08a, 0x8acf0f4a, 0x1030d9e9, 0xe910c930, 0x60a0e04, 0x406080a, 0x819866fe, 0xfe81e798, 0xf00baba0, 0xa0f05b0b, 0x44ccb478, 0x7844f0cc, 0xbad5f025, 0x25ba4ad5, 0xe33e754b, 0x4be3963e, 0xf30eaca2, 0xa2f35f0e, 0xfe19445d, 0x5dfeba19, 0xc05bdb80, 0x80c01b5b, 0x8a858005, 0x58a0a85, 0xadecd33f, 0x3fad7eec, 0xbcdffe21, 0x21bc42df, 0x48d8a870, 0x7048e0d8, 0x40cfdf1, 0xf104f90c, 0xdf7a1963, 0x63dfc67a, 0xc1582f77, 0x77c1ee58, 0x759f30af, 0xaf75459f, 0x63a5e742, 0x426384a5, 0x30507020, 0x20304050, 0x1a2ecbe5, 0xe51ad12e, 0xe12effd, 0xfd0ee112, 0x6db708bf, 0xbf6d65b7, 0x4cd45581, 0x814c19d4, 0x143c2418, 0x1814303c, 0x355f7926, 0x26354c5f, 0x2f71b2c3, 0xc32f9d71, 0xe13886be, 0xbee16738, 0xa2fdc835, 0x35a26afd, 0xcc4fc788, 0x88cc0b4f, 0x394b652e, 0x2e395c4b, 0x57f96a93, 0x93573df9, 0xf20d5855, 0x55f2aa0d, 0x829d61fc, 0xfc82e39d, 0x47c9b37a, 0x7a47f4c9, 0xacef27c8, 0xc8ac8bef, 0xe73288ba, 0xbae76f32, 0x2b7d4f32, 0x322b647d, 0x95a442e6, 0xe695d7a4, 0xa0fb3bc0, 0xc0a09bfb, 0x98b3aa19, 0x199832b3, 0xd168f69e, 0x9ed12768, 0x7f8122a3, 0xa37f5d81, 0x66aaee44, 0x446688aa, 0x7e82d654, 0x547ea882, 0xabe6dd3b, 0x3bab76e6, 0x839e950b, 0xb83169e, 0xca45c98c, 0x8cca0345, 0x297bbcc7, 0xc729957b, 0xd36e056b, 0x6bd3d66e, 0x3c446c28, 0x283c5044, 0x798b2ca7, 0xa779558b, 0xe23d81bc, 0xbce2633d, 0x1d273116, 0x161d2c27, 0x769a37ad, 0xad76419a, 0x3b4d96db, 0xdb3bad4d, 0x56fa9e64, 0x6456c8fa, 0x4ed2a674, 0x744ee8d2, 0x1e223614, 0x141e2822, 0xdb76e492, 0x92db3f76, 0xa1e120c, 0xc0a181e, 0x6cb4fc48, 0x486c90b4, 0xe4378fb8, 0xb8e46b37, 0x5de7789f, 0x9f5d25e7, 0x6eb20fbd, 0xbd6e61b2, 0xef2a6943, 0x43ef862a, 0xa6f135c4, 0xc4a693f1, 0xa8e3da39, 0x39a872e3, 0xa4f7c631, 0x31a462f7, 0x37598ad3, 0xd337bd59, 0x8b8674f2, 0xf28bff86, 0x325683d5, 0xd532b156, 0x43c54e8b, 0x8b430dc5, 0x59eb856e, 0x6e59dceb, 0xb7c218da, 0xdab7afc2, 0x8c8f8e01, 0x18c028f, 0x64ac1db1, 0xb16479ac, 0xd26df19c, 0x9cd2236d, 0xe03b7249, 0x49e0923b, 0xb4c71fd8, 0xd8b4abc7, 0xfa15b9ac, 0xacfa4315, 0x709faf3, 0xf307fd09, 0x256fa0cf, 0xcf25856f, 0xafea20ca, 0xcaaf8fea, 0x8e897df4, 0xf48ef389, 0xe9206747, 0x47e98e20, 0x18283810, 0x10182028, 0xd5640b6f, 0x6fd5de64, 0x888373f0, 0xf088fb83, 0x6fb1fb4a, 0x4a6f94b1, 0x7296ca5c, 0x5c72b896, 0x246c5438, 0x3824706c, 0xf1085f57, 0x57f1ae08, 0xc7522173, 0x73c7e652, 0x51f36497, 0x975135f3, 0x2365aecb, 0xcb238d65, 0x7c8425a1, 0xa17c5984, 0x9cbf57e8, 0xe89ccbbf, 0x21635d3e, 0x3e217c63, 0xdd7cea96, 0x96dd377c, 0xdc7f1e61, 0x61dcc27f, 0x86919c0d, 0xd861a91, 0x85949b0f, 0xf851e94, 0x90ab4be0, 0xe090dbab, 0x42c6ba7c, 0x7c42f8c6, 0xc4572671, 0x71c4e257, 0xaae529cc, 0xccaa83e5, 0xd873e390, 0x90d83b73, 0x50f0906, 0x6050c0f, 0x103f4f7, 0xf701f503, 0x12362a1c, 0x1c123836, 0xa3fe3cc2, 0xc2a39ffe, 0x5fe18b6a, 0x6a5fd4e1, 0xf910beae, 0xaef94710, 0xd06b0269, 0x69d0d26b, 0x91a8bf17, 0x17912ea8, 0x58e87199, 0x995829e8, 0x2769533a, 0x3a277469, 0xb9d0f727, 0x27b94ed0, 0x384891d9, 0xd938a948, 0x1335deeb, 0xeb13cd35, 0xb3cee52b, 0x2bb356ce, 0x33557722, 0x22334455, 0xbbd604d2, 0xd2bbbfd6, 0x709039a9, 0xa9704990, 0x89808707, 0x7890e80, 0xa7f2c133, 0x33a766f2, 0xb6c1ec2d, 0x2db65ac1, 0x22665a3c, 0x3c227866, 0x92adb815, 0x15922aad, 0x2060a9c9, 0xc9208960, 0x49db5c87, 0x874915db, 0xff1ab0aa, 0xaaff4f1a, 0x7888d850, 0x5078a088, 0x7a8e2ba5, 0xa57a518e, 0x8f8a8903, 0x38f068a, 0xf8134a59, 0x59f8b213, 0x809b9209, 0x980129b, 0x1739231a, 0x1a173439, 0xda751065, 0x65daca75, 0x315384d7, 0xd731b553, 0xc651d584, 0x84c61351, 0xb8d303d0, 0xd0b8bbd3, 0xc35edc82, 0x82c31f5e, 0xb0cbe229, 0x29b052cb, 0x7799c35a, 0x5a77b499, 0x11332d1e, 0x1e113c33, 0xcb463d7b, 0x7bcbf646, 0xfc1fb7a8, 0xa8fc4b1f, 0xd6610c6d, 0x6dd6da61, 0x3a4e622c, 0x2c3a584e}; #endif /* __tables_h */ diff --git a/xmrstak/backend/cpu/crypto/hash.h b/xmrstak/backend/cpu/crypto/hash.h index 2af330932..574581376 100644 --- a/xmrstak/backend/cpu/crypto/hash.h +++ b/xmrstak/backend/cpu/crypto/hash.h @@ -4,4 +4,9 @@ typedef unsigned char BitSequence; typedef uint32_t DataLength; -typedef enum {SUCCESS = 0, FAIL = 1, BAD_HASHLEN = 2} HashReturn; +typedef enum +{ + SUCCESS = 0, + FAIL = 1, + BAD_HASHLEN = 2 +} HashReturn; diff --git a/xmrstak/backend/cpu/crypto/int-util.h b/xmrstak/backend/cpu/crypto/int-util.h index 8748976c1..393b4f3d2 100644 --- a/xmrstak/backend/cpu/crypto/int-util.h +++ b/xmrstak/backend/cpu/crypto/int-util.h @@ -12,43 +12,51 @@ #if defined(_MSC_VER) #include -static inline uint32_t rol32(uint32_t x, int r) { +static inline uint32_t rol32(uint32_t x, int r) +{ static_assert(sizeof(uint32_t) == sizeof(unsigned int), "this code assumes 32-bit integers"); return _rotl(x, r); } -static inline uint64_t rol64(uint64_t x, int r) { +static inline uint64_t rol64(uint64_t x, int r) +{ return _rotl64(x, r); } #else -static inline uint32_t rol32(uint32_t x, int r) { +static inline uint32_t rol32(uint32_t x, int r) +{ return (x << (r & 31)) | (x >> (-r & 31)); } -static inline uint64_t rol64(uint64_t x, int r) { +static inline uint64_t rol64(uint64_t x, int r) +{ return (x << (r & 63)) | (x >> (-r & 63)); } #endif -static inline uint64_t hi_dword(uint64_t val) { +static inline uint64_t hi_dword(uint64_t val) +{ return val >> 32; } -static inline uint64_t lo_dword(uint64_t val) { +static inline uint64_t lo_dword(uint64_t val) +{ return val & 0xFFFFFFFF; } -static inline uint64_t div_with_reminder(uint64_t dividend, uint32_t divisor, uint32_t* remainder) { +static inline uint64_t div_with_reminder(uint64_t dividend, uint32_t divisor, uint32_t* remainder) +{ dividend |= ((uint64_t)*remainder) << 32; *remainder = dividend % divisor; return dividend / divisor; } // Long division with 2^32 base -static inline uint32_t div128_32(uint64_t dividend_hi, uint64_t dividend_lo, uint32_t divisor, uint64_t* quotient_hi, uint64_t* quotient_lo) { +static inline uint32_t div128_32(uint64_t dividend_hi, uint64_t dividend_lo, uint32_t divisor, uint64_t* quotient_hi, uint64_t* quotient_lo) +{ uint64_t dividend_dwords[4]; uint32_t remainder = 0; @@ -65,30 +73,35 @@ static inline uint32_t div128_32(uint64_t dividend_hi, uint64_t dividend_lo, uin return remainder; } -#define IDENT32(x) ((uint32_t) (x)) -#define IDENT64(x) ((uint64_t) (x)) +#define IDENT32(x) ((uint32_t)(x)) +#define IDENT64(x) ((uint64_t)(x)) -#define SWAP32(x) ((((uint32_t) (x) & 0x000000ff) << 24) | \ - (((uint32_t) (x) & 0x0000ff00) << 8) | \ - (((uint32_t) (x) & 0x00ff0000) >> 8) | \ - (((uint32_t) (x) & 0xff000000) >> 24)) -#define SWAP64(x) ((((uint64_t) (x) & 0x00000000000000ff) << 56) | \ - (((uint64_t) (x) & 0x000000000000ff00) << 40) | \ - (((uint64_t) (x) & 0x0000000000ff0000) << 24) | \ - (((uint64_t) (x) & 0x00000000ff000000) << 8) | \ - (((uint64_t) (x) & 0x000000ff00000000) >> 8) | \ - (((uint64_t) (x) & 0x0000ff0000000000) >> 24) | \ - (((uint64_t) (x) & 0x00ff000000000000) >> 40) | \ - (((uint64_t) (x) & 0xff00000000000000) >> 56)) +#define SWAP32(x) ((((uint32_t)(x)&0x000000ff) << 24) | \ + (((uint32_t)(x)&0x0000ff00) << 8) | \ + (((uint32_t)(x)&0x00ff0000) >> 8) | \ + (((uint32_t)(x)&0xff000000) >> 24)) +#define SWAP64(x) ((((uint64_t)(x)&0x00000000000000ff) << 56) | \ + (((uint64_t)(x)&0x000000000000ff00) << 40) | \ + (((uint64_t)(x)&0x0000000000ff0000) << 24) | \ + (((uint64_t)(x)&0x00000000ff000000) << 8) | \ + (((uint64_t)(x)&0x000000ff00000000) >> 8) | \ + (((uint64_t)(x)&0x0000ff0000000000) >> 24) | \ + (((uint64_t)(x)&0x00ff000000000000) >> 40) | \ + (((uint64_t)(x)&0xff00000000000000) >> 56)) -static inline uint32_t ident32(uint32_t x) { return x; } +static inline uint32_t ident32(uint32_t x) +{ + return x; +} static inline uint64_t ident64(uint64_t x) { return x; } -static inline uint32_t swap32(uint32_t x) { +static inline uint32_t swap32(uint32_t x) +{ x = ((x & 0x00ff00ff) << 8) | ((x & 0xff00ff00) >> 8); return (x << 16) | (x >> 16); } -static inline uint64_t swap64(uint64_t x) { +static inline uint64_t swap64(uint64_t x) +{ x = ((x & 0x00ff00ff00ff00ff) << 8) | ((x & 0xff00ff00ff00ff00) >> 8); x = ((x & 0x0000ffff0000ffff) << 16) | ((x & 0xffff0000ffff0000) >> 16); return (x << 32) | (x >> 32); @@ -99,39 +112,51 @@ static inline uint64_t swap64(uint64_t x) { #else #define UNUSED #endif -static inline void mem_inplace_ident(void *mem UNUSED, size_t n UNUSED) { } +static inline void mem_inplace_ident(void* mem UNUSED, size_t n UNUSED) +{ +} #undef UNUSED -static inline void mem_inplace_swap32(void *mem, size_t n) { +static inline void mem_inplace_swap32(void* mem, size_t n) +{ size_t i; - for (i = 0; i < n; i++) { - ((uint32_t *)mem)[i] = swap32(((const uint32_t *)mem)[i]); + for(i = 0; i < n; i++) + { + ((uint32_t*)mem)[i] = swap32(((const uint32_t*)mem)[i]); } } -static inline void mem_inplace_swap64(void *mem, size_t n) { +static inline void mem_inplace_swap64(void* mem, size_t n) +{ size_t i; - for (i = 0; i < n; i++) { - ((uint64_t *)mem)[i] = swap64(((const uint64_t *)mem)[i]); + for(i = 0; i < n; i++) + { + ((uint64_t*)mem)[i] = swap64(((const uint64_t*)mem)[i]); } } -static inline void memcpy_ident32(void *dst, const void *src, size_t n) { +static inline void memcpy_ident32(void* dst, const void* src, size_t n) +{ memcpy(dst, src, 4 * n); } -static inline void memcpy_ident64(void *dst, const void *src, size_t n) { +static inline void memcpy_ident64(void* dst, const void* src, size_t n) +{ memcpy(dst, src, 8 * n); } -static inline void memcpy_swap32(void *dst, const void *src, size_t n) { +static inline void memcpy_swap32(void* dst, const void* src, size_t n) +{ size_t i; - for (i = 0; i < n; i++) { - ((uint32_t *)dst)[i] = swap32(((const uint32_t *)src)[i]); + for(i = 0; i < n; i++) + { + ((uint32_t*)dst)[i] = swap32(((const uint32_t*)src)[i]); } } -static inline void memcpy_swap64(void *dst, const void *src, size_t n) { +static inline void memcpy_swap64(void* dst, const void* src, size_t n) +{ size_t i; - for (i = 0; i < n; i++) { - ((uint64_t *)dst)[i] = swap64(((const uint64_t *)src)[i]); + for(i = 0; i < n; i++) + { + ((uint64_t*)dst)[i] = swap64(((const uint64_t*)src)[i]); } } diff --git a/xmrstak/backend/cpu/crypto/skein_port.h b/xmrstak/backend/cpu/crypto/skein_port.h index 99641bcdf..1648cdc7d 100644 --- a/xmrstak/backend/cpu/crypto/skein_port.h +++ b/xmrstak/backend/cpu/crypto/skein_port.h @@ -2,38 +2,38 @@ #define _SKEIN_PORT_H_ #include -#include #include +#include #ifndef RETURN_VALUES -# define RETURN_VALUES -# if defined( DLL_EXPORT ) -# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER ) -# define VOID_RETURN __declspec( dllexport ) void __stdcall -# define INT_RETURN __declspec( dllexport ) int __stdcall -# elif defined( __GNUC__ ) -# define VOID_RETURN __declspec( __dllexport__ ) void -# define INT_RETURN __declspec( __dllexport__ ) int -# else -# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers -# endif -# elif defined( DLL_IMPORT ) -# if defined( _MSC_VER ) || defined ( __INTEL_COMPILER ) -# define VOID_RETURN __declspec( dllimport ) void __stdcall -# define INT_RETURN __declspec( dllimport ) int __stdcall -# elif defined( __GNUC__ ) -# define VOID_RETURN __declspec( __dllimport__ ) void -# define INT_RETURN __declspec( __dllimport__ ) int -# else -# error Use of the DLL is only available on the Microsoft, Intel and GCC compilers -# endif -# elif defined( __WATCOMC__ ) -# define VOID_RETURN void __cdecl -# define INT_RETURN int __cdecl -# else -# define VOID_RETURN void -# define INT_RETURN int -# endif +#define RETURN_VALUES +#if defined(DLL_EXPORT) +#if defined(_MSC_VER) || defined(__INTEL_COMPILER) +#define VOID_RETURN __declspec(dllexport) void __stdcall +#define INT_RETURN __declspec(dllexport) int __stdcall +#elif defined(__GNUC__) +#define VOID_RETURN __declspec(__dllexport__) void +#define INT_RETURN __declspec(__dllexport__) int +#else +#error Use of the DLL is only available on the Microsoft, Intel and GCC compilers +#endif +#elif defined(DLL_IMPORT) +#if defined(_MSC_VER) || defined(__INTEL_COMPILER) +#define VOID_RETURN __declspec(dllimport) void __stdcall +#define INT_RETURN __declspec(dllimport) int __stdcall +#elif defined(__GNUC__) +#define VOID_RETURN __declspec(__dllimport__) void +#define INT_RETURN __declspec(__dllimport__) int +#else +#error Use of the DLL is only available on the Microsoft, Intel and GCC compilers +#endif +#elif defined(__WATCOMC__) +#define VOID_RETURN void __cdecl +#define INT_RETURN int __cdecl +#else +#define VOID_RETURN void +#define INT_RETURN int +#endif #endif /* These defines are used to declare buffers in a way that allows @@ -52,17 +52,17 @@ variable of length 'size' bits */ -#define ui_type(size) uint##size##_t -#define dec_unit_type(size,x) typedef ui_type(size) x -#define dec_bufr_type(size,bsize,x) typedef ui_type(size) x[bsize / (size >> 3)] -#define ptr_cast(x,size) ((ui_type(size)*)(x)) +#define ui_type(size) uint##size##_t +#define dec_unit_type(size, x) typedef ui_type(size) x +#define dec_bufr_type(size, bsize, x) typedef ui_type(size) x[bsize / (size >> 3)] +#define ptr_cast(x, size) ((ui_type(size)*)(x)) -typedef unsigned int uint_t; /* native unsigned integer */ -typedef uint8_t u08b_t; /* 8-bit unsigned integer */ -typedef uint64_t u64b_t; /* 64-bit unsigned integer */ +typedef unsigned int uint_t; /* native unsigned integer */ +typedef uint8_t u08b_t; /* 8-bit unsigned integer */ +typedef uint64_t u64b_t; /* 64-bit unsigned integer */ #ifndef RotL_64 -#define RotL_64(x,N) (((x) << (N)) | ((x) >> (64-(N)))) +#define RotL_64(x, N) (((x) << (N)) | ((x) >> (64 - (N)))) #endif /* @@ -91,26 +91,25 @@ typedef uint64_t u64b_t; /* 64-bit unsigned integer */ /* special handler for IA64, which may be either endianness (?) */ /* here we assume little-endian, but this may need to be changed */ #if defined(__ia64) || defined(__ia64__) || defined(_M_IA64) -# define PLATFORM_MUST_ALIGN (1) +#define PLATFORM_MUST_ALIGN (1) #ifndef PLATFORM_BYTE_ORDER -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN #endif #endif -#ifndef PLATFORM_MUST_ALIGN -# define PLATFORM_MUST_ALIGN (0) +#ifndef PLATFORM_MUST_ALIGN +#define PLATFORM_MUST_ALIGN (0) #endif - -#if PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN - /* here for big-endian CPUs */ -#define SKEIN_NEED_SWAP (1) +#if PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN +/* here for big-endian CPUs */ +#define SKEIN_NEED_SWAP (1) #elif PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN - /* here for x86 and x86-64 CPUs (and other detected little-endian CPUs) */ -#define SKEIN_NEED_SWAP (0) -#if PLATFORM_MUST_ALIGN == 0 /* ok to use "fast" versions? */ -#define Skein_Put64_LSB_First(dst08,src64,bCnt) memcpy(dst08,src64,bCnt) -#define Skein_Get64_LSB_First(dst64,src08,wCnt) memcpy(dst64,src08,8*(wCnt)) +/* here for x86 and x86-64 CPUs (and other detected little-endian CPUs) */ +#define SKEIN_NEED_SWAP (0) +#if PLATFORM_MUST_ALIGN == 0 /* ok to use "fast" versions? */ +#define Skein_Put64_LSB_First(dst08, src64, bCnt) memcpy(dst08, src64, bCnt) +#define Skein_Get64_LSB_First(dst64, src08, wCnt) memcpy(dst64, src08, 8 * (wCnt)) #endif #else #error "Skein needs endianness setting!" @@ -123,57 +122,55 @@ typedef uint64_t u64b_t; /* 64-bit unsigned integer */ * Provide any definitions still needed. ****************************************************************** */ -#ifndef Skein_Swap64 /* swap for big-endian, nop for little-endian */ -#if SKEIN_NEED_SWAP -#define Skein_Swap64(w64) \ - ( (( ((u64b_t)(w64)) & 0xFF) << 56) | \ - (((((u64b_t)(w64)) >> 8) & 0xFF) << 48) | \ - (((((u64b_t)(w64)) >>16) & 0xFF) << 40) | \ - (((((u64b_t)(w64)) >>24) & 0xFF) << 32) | \ - (((((u64b_t)(w64)) >>32) & 0xFF) << 24) | \ - (((((u64b_t)(w64)) >>40) & 0xFF) << 16) | \ - (((((u64b_t)(w64)) >>48) & 0xFF) << 8) | \ - (((((u64b_t)(w64)) >>56) & 0xFF) ) ) +#ifndef Skein_Swap64 /* swap for big-endian, nop for little-endian */ +#if SKEIN_NEED_SWAP +#define Skein_Swap64(w64) \ + (((((u64b_t)(w64)) & 0xFF) << 56) | \ + (((((u64b_t)(w64)) >> 8) & 0xFF) << 48) | \ + (((((u64b_t)(w64)) >> 16) & 0xFF) << 40) | \ + (((((u64b_t)(w64)) >> 24) & 0xFF) << 32) | \ + (((((u64b_t)(w64)) >> 32) & 0xFF) << 24) | \ + (((((u64b_t)(w64)) >> 40) & 0xFF) << 16) | \ + (((((u64b_t)(w64)) >> 48) & 0xFF) << 8) | \ + (((((u64b_t)(w64)) >> 56) & 0xFF))) #else -#define Skein_Swap64(w64) (w64) +#define Skein_Swap64(w64) (w64) #endif -#endif /* ifndef Skein_Swap64 */ - +#endif /* ifndef Skein_Swap64 */ #ifndef Skein_Put64_LSB_First -void Skein_Put64_LSB_First(u08b_t *dst,const u64b_t *src,size_t bCnt) -#ifdef SKEIN_PORT_CODE /* instantiate the function code here? */ - { /* this version is fully portable (big-endian or little-endian), but slow */ +void Skein_Put64_LSB_First(u08b_t* dst, const u64b_t* src, size_t bCnt) +#ifdef SKEIN_PORT_CODE /* instantiate the function code here? */ +{ /* this version is fully portable (big-endian or little-endian), but slow */ size_t n; - for (n=0;n>3] >> (8*(n&7))); - } + for(n = 0; n < bCnt; n++) + dst[n] = (u08b_t)(src[n >> 3] >> (8 * (n & 7))); +} #else - ; /* output only the function prototype */ + ; /* output only the function prototype */ #endif -#endif /* ifndef Skein_Put64_LSB_First */ - +#endif /* ifndef Skein_Put64_LSB_First */ #ifndef Skein_Get64_LSB_First -void Skein_Get64_LSB_First(u64b_t *dst,const u08b_t *src,size_t wCnt) -#ifdef SKEIN_PORT_CODE /* instantiate the function code here? */ - { /* this version is fully portable (big-endian or little-endian), but slow */ +void Skein_Get64_LSB_First(u64b_t* dst, const u08b_t* src, size_t wCnt) +#ifdef SKEIN_PORT_CODE /* instantiate the function code here? */ +{ /* this version is fully portable (big-endian or little-endian), but slow */ size_t n; - for (n=0;n<8*wCnt;n+=8) - dst[n/8] = (((u64b_t) src[n ]) ) + - (((u64b_t) src[n+1]) << 8) + - (((u64b_t) src[n+2]) << 16) + - (((u64b_t) src[n+3]) << 24) + - (((u64b_t) src[n+4]) << 32) + - (((u64b_t) src[n+5]) << 40) + - (((u64b_t) src[n+6]) << 48) + - (((u64b_t) src[n+7]) << 56) ; - } + for(n = 0; n < 8 * wCnt; n += 8) + dst[n / 8] = (((u64b_t)src[n])) + + (((u64b_t)src[n + 1]) << 8) + + (((u64b_t)src[n + 2]) << 16) + + (((u64b_t)src[n + 3]) << 24) + + (((u64b_t)src[n + 4]) << 32) + + (((u64b_t)src[n + 5]) << 40) + + (((u64b_t)src[n + 6]) << 48) + + (((u64b_t)src[n + 7]) << 56); +} #else - ; /* output only the function prototype */ + ; /* output only the function prototype */ #endif -#endif /* ifndef Skein_Get64_LSB_First */ +#endif /* ifndef Skein_Get64_LSB_First */ -#endif /* ifndef _SKEIN_PORT_H_ */ +#endif /* ifndef _SKEIN_PORT_H_ */ diff --git a/xmrstak/backend/cpu/crypto/soft_aes.hpp b/xmrstak/backend/cpu/crypto/soft_aes.hpp index 9b4ae0ab5..3ea75c5e6 100644 --- a/xmrstak/backend/cpu/crypto/soft_aes.hpp +++ b/xmrstak/backend/cpu/crypto/soft_aes.hpp @@ -34,56 +34,58 @@ #include -#define saes_data(w) {\ - w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5),\ - w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76),\ - w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0),\ - w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0),\ - w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc),\ - w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15),\ - w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a),\ - w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75),\ - w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0),\ - w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84),\ - w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b),\ - w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf),\ - w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85),\ - w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8),\ - w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5),\ - w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2),\ - w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17),\ - w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73),\ - w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88),\ - w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb),\ - w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c),\ - w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79),\ - w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9),\ - w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08),\ - w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6),\ - w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a),\ - w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e),\ - w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e),\ - w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94),\ - w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf),\ - w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68),\ - w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) } +#define saes_data(w) \ + { \ + w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), w(0xc5), \ + w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), w(0xab), w(0x76), \ + w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), w(0x59), w(0x47), w(0xf0), \ + w(0xad), w(0xd4), w(0xa2), w(0xaf), w(0x9c), w(0xa4), w(0x72), w(0xc0), \ + w(0xb7), w(0xfd), w(0x93), w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), \ + w(0x34), w(0xa5), w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), \ + w(0x04), w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \ + w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), w(0x75), \ + w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), w(0x5a), w(0xa0), \ + w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), w(0xe3), w(0x2f), w(0x84), \ + w(0x53), w(0xd1), w(0x00), w(0xed), w(0x20), w(0xfc), w(0xb1), w(0x5b), \ + w(0x6a), w(0xcb), w(0xbe), w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), \ + w(0xd0), w(0xef), w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), \ + w(0x45), w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \ + w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), w(0xf5), \ + w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), w(0xf3), w(0xd2), \ + w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), w(0x97), w(0x44), w(0x17), \ + w(0xc4), w(0xa7), w(0x7e), w(0x3d), w(0x64), w(0x5d), w(0x19), w(0x73), \ + w(0x60), w(0x81), w(0x4f), w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), \ + w(0x46), w(0xee), w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), \ + w(0xe0), w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \ + w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), w(0x79), \ + w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), w(0x4e), w(0xa9), \ + w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), w(0x7a), w(0xae), w(0x08), \ + w(0xba), w(0x78), w(0x25), w(0x2e), w(0x1c), w(0xa6), w(0xb4), w(0xc6), \ + w(0xe8), w(0xdd), w(0x74), w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), \ + w(0x70), w(0x3e), w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), \ + w(0x61), w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \ + w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), w(0x94), \ + w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), w(0x28), w(0xdf), \ + w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), w(0xe6), w(0x42), w(0x68), \ + w(0x41), w(0x99), w(0x2d), w(0x0f), w(0xb0), w(0x54), w(0xbb), w(0x16) \ + } -#define SAES_WPOLY 0x011b +#define SAES_WPOLY 0x011b #define saes_b2w(b0, b1, b2, b3) (((uint32_t)(b3) << 24) | \ - ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | (b0)) + ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | (b0)) -#define saes_f2(x) ((x<<1) ^ (((x>>7) & 1) * SAES_WPOLY)) -#define saes_f3(x) (saes_f2(x) ^ x) -#define saes_h0(x) (x) +#define saes_f2(x) ((x << 1) ^ (((x >> 7) & 1) * SAES_WPOLY)) +#define saes_f3(x) (saes_f2(x) ^ x) +#define saes_h0(x) (x) -#define saes_u0(p) saes_b2w(saes_f2(p), p, p, saes_f3(p)) -#define saes_u1(p) saes_b2w(saes_f3(p), saes_f2(p), p, p) -#define saes_u2(p) saes_b2w( p, saes_f3(p), saes_f2(p), p) -#define saes_u3(p) saes_b2w( p, p, saes_f3(p), saes_f2(p)) +#define saes_u0(p) saes_b2w(saes_f2(p), p, p, saes_f3(p)) +#define saes_u1(p) saes_b2w(saes_f3(p), saes_f2(p), p, p) +#define saes_u2(p) saes_b2w(p, saes_f3(p), saes_f2(p), p) +#define saes_u3(p) saes_b2w(p, p, saes_f3(p), saes_f2(p)) -alignas(16) const uint32_t saes_table[4][256] = { saes_data(saes_u0), saes_data(saes_u1), saes_data(saes_u2), saes_data(saes_u3) }; -alignas(16) const uint8_t saes_sbox[256] = saes_data(saes_h0); +alignas(16) const uint32_t saes_table[4][256] = {saes_data(saes_u0), saes_data(saes_u1), saes_data(saes_u2), saes_data(saes_u3)}; +alignas(16) const uint8_t saes_sbox[256] = saes_data(saes_h0); static inline __m128i soft_aesenc(__m128i in, __m128i key) { @@ -104,10 +106,10 @@ static inline __m128i soft_aesenc(__m128i in, __m128i key) static inline uint32_t sub_word(uint32_t key) { - return (saes_sbox[key >> 24 ] << 24) | - (saes_sbox[(key >> 16) & 0xff] << 16 ) | - (saes_sbox[(key >> 8) & 0xff] << 8 ) | - saes_sbox[key & 0xff]; + return (saes_sbox[key >> 24] << 24) | + (saes_sbox[(key >> 16) & 0xff] << 16) | + (saes_sbox[(key >> 8) & 0xff] << 8) | + saes_sbox[key & 0xff]; } #ifdef __clang__ @@ -121,5 +123,5 @@ static inline __m128i soft_aeskeygenassist(__m128i key, uint8_t rcon) { uint32_t X1 = sub_word(_mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55))); uint32_t X3 = sub_word(_mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF))); - return _mm_set_epi32(_rotr(X3, 8) ^ rcon, X3,_rotr(X1, 8) ^ rcon, X1); + return _mm_set_epi32(_rotr(X3, 8) ^ rcon, X3, _rotr(X1, 8) ^ rcon, X1); } diff --git a/xmrstak/backend/cpu/crypto/variant4_random_math.h b/xmrstak/backend/cpu/crypto/variant4_random_math.h index 50228adf2..9fe61db51 100644 --- a/xmrstak/backend/cpu/crypto/variant4_random_math.h +++ b/xmrstak/backend/cpu/crypto/variant4_random_math.h @@ -1,12 +1,12 @@ #pragma once -#include #include "../../cryptonight.hpp" #include "xmrstak/misc/console.hpp" +#include extern "C" { - #include "c_blake256.h" +#include "c_blake256.h" } enum V4_Settings @@ -31,13 +31,13 @@ enum V4_Settings enum V4_InstructionList { - MUL, // a*b - ADD, // a+b + C, C is an unsigned 32-bit constant - SUB, // a-b - ROR, // rotate right "a" by "b & 31" bits - ROL, // rotate left "a" by "b & 31" bits - XOR, // a^b - RET, // finish execution + MUL, // a*b + ADD, // a+b + C, C is an unsigned 32-bit constant + SUB, // a-b + ROR, // rotate right "a" by "b & 31" bits + ROL, // rotate left "a" by "b & 31" bits + XOR, // a^b + RET, // finish execution V4_INSTRUCTION_COUNT = RET, }; @@ -87,7 +87,7 @@ struct V4_Instruction // every switch-case will point to the same destination on every iteration of Cryptonight main loop // // This is about as fast as it can get without using low-level machine code generation -template +template static void v4_random_math(const struct V4_Instruction* code, v4_reg* r) { enum @@ -95,55 +95,55 @@ static void v4_random_math(const struct V4_Instruction* code, v4_reg* r) REG_BITS = sizeof(v4_reg) * 8, }; -#define V4_EXEC(i) \ - { \ - const struct V4_Instruction* op = code + i; \ - const v4_reg src = r[op->src_index]; \ - v4_reg* dst = r + op->dst_index; \ - switch (op->opcode) \ - { \ - case MUL: \ - *dst *= src; \ - break; \ - case ADD: \ - *dst += src + op->C; \ - break; \ - case SUB: \ - *dst -= src; \ - break; \ - case ROR: \ - { \ - const uint32_t shift = src % REG_BITS; \ - *dst = (*dst >> shift) | (*dst << ((REG_BITS - shift) % REG_BITS)); \ - } \ - break; \ - case ROL: \ - { \ - const uint32_t shift = src % REG_BITS; \ - *dst = (*dst << shift) | (*dst >> ((REG_BITS - shift) % REG_BITS)); \ - } \ - break; \ - case XOR: \ - *dst ^= src; \ - break; \ - case RET: \ - return; \ - default: \ - UNREACHABLE_CODE; \ - break; \ - } \ +#define V4_EXEC(i) \ + { \ + const struct V4_Instruction* op = code + i; \ + const v4_reg src = r[op->src_index]; \ + v4_reg* dst = r + op->dst_index; \ + switch(op->opcode) \ + { \ + case MUL: \ + *dst *= src; \ + break; \ + case ADD: \ + *dst += src + op->C; \ + break; \ + case SUB: \ + *dst -= src; \ + break; \ + case ROR: \ + { \ + const uint32_t shift = src % REG_BITS; \ + *dst = (*dst >> shift) | (*dst << ((REG_BITS - shift) % REG_BITS)); \ + } \ + break; \ + case ROL: \ + { \ + const uint32_t shift = src % REG_BITS; \ + *dst = (*dst << shift) | (*dst >> ((REG_BITS - shift) % REG_BITS)); \ + } \ + break; \ + case XOR: \ + *dst ^= src; \ + break; \ + case RET: \ + return; \ + default: \ + UNREACHABLE_CODE; \ + break; \ + } \ } #define V4_EXEC_10(j) \ - V4_EXEC(j + 0) \ - V4_EXEC(j + 1) \ - V4_EXEC(j + 2) \ - V4_EXEC(j + 3) \ - V4_EXEC(j + 4) \ - V4_EXEC(j + 5) \ - V4_EXEC(j + 6) \ - V4_EXEC(j + 7) \ - V4_EXEC(j + 8) \ + V4_EXEC(j + 0) \ + V4_EXEC(j + 1) \ + V4_EXEC(j + 2) \ + V4_EXEC(j + 3) \ + V4_EXEC(j + 4) \ + V4_EXEC(j + 5) \ + V4_EXEC(j + 6) \ + V4_EXEC(j + 7) \ + V4_EXEC(j + 8) \ V4_EXEC(j + 9) // Generated program can have 60 + a few more (usually 2-3) instructions to achieve required latency @@ -161,13 +161,13 @@ static void v4_random_math(const struct V4_Instruction* code, v4_reg* r) // 69 102 // Unroll 70 instructions here - V4_EXEC_10(0); // instructions 0-9 - V4_EXEC_10(10); // instructions 10-19 - V4_EXEC_10(20); // instructions 20-29 - V4_EXEC_10(30); // instructions 30-39 - V4_EXEC_10(40); // instructions 40-49 - V4_EXEC_10(50); // instructions 50-59 - V4_EXEC_10(60); // instructions 60-69 + V4_EXEC_10(0); // instructions 0-9 + V4_EXEC_10(10); // instructions 10-19 + V4_EXEC_10(20); // instructions 20-29 + V4_EXEC_10(30); // instructions 30-39 + V4_EXEC_10(40); // instructions 40-49 + V4_EXEC_10(50); // instructions 50-59 + V4_EXEC_10(60); // instructions 60-69 #undef V4_EXEC_10 #undef V4_EXEC @@ -176,7 +176,7 @@ static void v4_random_math(const struct V4_Instruction* code, v4_reg* r) // If we don't have enough data available, generate more static FORCEINLINE void check_data(size_t* data_index, const size_t bytes_needed, int8_t* data, const size_t data_size) { - if (*data_index + bytes_needed > data_size) + if(*data_index + bytes_needed > data_size) { blake256_hash((uint8_t*)data, (uint8_t*)data, data_size); *data_index = 0; @@ -188,7 +188,7 @@ static FORCEINLINE void check_data(size_t* data_index, const size_t bytes_needed // Generates as many random math operations as possible with given latency and ALU restrictions // "code" array must have space for NUM_INSTRUCTIONS_MAX+1 instructions -template +template static int v4_random_math_init(struct V4_Instruction* code, const uint64_t height) { printer::inst()->print_msg(LDEBUG, "CryptonightR create random math for block %llu", height); @@ -199,13 +199,13 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh // Surprisingly, Intel Nehalem also has 1-cycle ROR/ROL, so it'll also be faster than Intel Sandy Bridge and newer processors // AMD Bulldozer has 4 cycles latency for MUL (slower than Intel) and 1 cycle for ROR/ROL (faster than Intel), so average performance will be the same // Source: https://www.agner.org/optimize/instruction_tables.pdf - const int op_latency[V4_INSTRUCTION_COUNT] = { 3, 2, 1, 2, 2, 1 }; + const int op_latency[V4_INSTRUCTION_COUNT] = {3, 2, 1, 2, 2, 1}; // Instruction latencies for theoretical ASIC implementation - const int asic_op_latency[V4_INSTRUCTION_COUNT] = { 3, 1, 1, 1, 1, 1 }; + const int asic_op_latency[V4_INSTRUCTION_COUNT] = {3, 1, 1, 1, 1, 1}; // Available ALUs for each instruction - const int op_ALUs[V4_INSTRUCTION_COUNT] = { ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT }; + const int op_ALUs[V4_INSTRUCTION_COUNT] = {ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT}; int8_t data[32]; memset(data, 0, sizeof(data)); @@ -226,7 +226,8 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh // There is a small chance (1.8%) that register R8 won't be used in the generated program // So we keep track of it and try again if it's not used bool r8_used; - do { + do + { int latency[9]; int asic_latency[9]; @@ -237,7 +238,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh // // Registers R4-R8 are constant and are treated as having the same value because when we do // the same operation twice with two constant source registers, it can be optimized into a single operation - uint32_t inst_data[9] = { 0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF }; + uint32_t inst_data[9] = {0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF}; bool alu_busy[TOTAL_LATENCY + 1][ALU_COUNT]; bool is_rotation[V4_INSTRUCTION_COUNT]; @@ -260,11 +261,11 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh // Generate random code to achieve minimal required latency for our abstract CPU // Try to get this latency for all 4 registers - while (((latency[0] < TOTAL_LATENCY) || (latency[1] < TOTAL_LATENCY) || (latency[2] < TOTAL_LATENCY) || (latency[3] < TOTAL_LATENCY)) && (num_retries < 64)) + while(((latency[0] < TOTAL_LATENCY) || (latency[1] < TOTAL_LATENCY) || (latency[2] < TOTAL_LATENCY) || (latency[3] < TOTAL_LATENCY)) && (num_retries < 64)) { // Fail-safe to guarantee loop termination ++total_iterations; - if (total_iterations > 256) + if(total_iterations > 256) break; check_data(&data_index, 1, data, sizeof(data)); @@ -277,12 +278,12 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh // ROR/ROL = opcode 5, shift direction is selected randomly // XOR = opcodes 6-7 uint8_t opcode = c & ((1 << V4_OPCODE_BITS) - 1); - if (opcode == 5) + if(opcode == 5) { check_data(&data_index, 1, data, sizeof(data)); opcode = (data[data_index++] >= 0) ? ROR : ROL; } - else if (opcode >= 6) + else if(opcode >= 6) { opcode = XOR; } @@ -298,7 +299,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh int b = src_index; // Don't do ADD/SUB/XOR with the same register - if (((opcode == ADD) || (opcode == SUB) || (opcode == XOR)) && (a == b)) + if(((opcode == ADD) || (opcode == SUB) || (opcode == XOR)) && (a == b)) { // a is always < 4, so we don't need to check bounds here b = (ALGO == cryptonight_r_wow) ? (a + 4) : 8; @@ -306,7 +307,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh } // Don't do rotation with the same destination twice because it's equal to a single rotation - if (is_rotation[opcode] && rotated[a]) + if(is_rotation[opcode] && rotated[a]) { continue; } @@ -314,7 +315,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh // Don't do the same instruction (except MUL) with the same source value twice because all other cases can be optimized: // 2xADD(a, b, C) = ADD(a, b*2, C1+C2), same for SUB and rotations // 2xXOR(a, b) = NOP - if ((opcode != MUL) && ((inst_data[a] & 0xFFFF00) == (opcode << 8) + ((inst_data[b] & 255) << 16))) + if((opcode != MUL) && ((inst_data[a] & 0xFFFF00) == (opcode << 8) + ((inst_data[b] & 255) << 16))) { continue; } @@ -322,20 +323,20 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh // Find which ALU is available (and when) for this instruction int next_latency = (latency[a] > latency[b]) ? latency[a] : latency[b]; int alu_index = -1; - while (next_latency < TOTAL_LATENCY) + while(next_latency < TOTAL_LATENCY) { - for (int i = op_ALUs[opcode] - 1; i >= 0; --i) + for(int i = op_ALUs[opcode] - 1; i >= 0; --i) { - if (!alu_busy[next_latency][i]) + if(!alu_busy[next_latency][i]) { // ADD is implemented as two 1-cycle instructions on a real CPU, so do an additional availability check - if ((opcode == ADD) && alu_busy[next_latency + 1][i]) + if((opcode == ADD) && alu_busy[next_latency + 1][i]) { continue; } // Rotation can only start when previous rotation is finished, so do an additional availability check - if (is_rotation[opcode] && (next_latency < rotate_count * op_latency[opcode])) + if(is_rotation[opcode] && (next_latency < rotate_count * op_latency[opcode])) { continue; } @@ -344,7 +345,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh break; } } - if (alu_index >= 0) + if(alu_index >= 0) { break; } @@ -352,16 +353,16 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh } // Don't generate instructions that leave some register unchanged for more than 7 cycles - if (next_latency > latency[a] + 7) + if(next_latency > latency[a] + 7) { continue; } next_latency += op_latency[opcode]; - if (next_latency <= TOTAL_LATENCY) + if(next_latency <= TOTAL_LATENCY) { - if (is_rotation[opcode]) + if(is_rotation[opcode]) { ++rotate_count; } @@ -382,12 +383,12 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh code[code_size].src_index = src_index; code[code_size].C = 0; - if (src_index == 8) + if(src_index == 8) { r8_used = true; } - if (opcode == ADD) + if(opcode == ADD) { // ADD instruction is implemented as two 1-cycle instructions on a real CPU, so mark ALU as busy for the next cycle too alu_busy[next_latency - op_latency[opcode] + 1][alu_index] = true; @@ -401,7 +402,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh } ++code_size; - if (code_size >= NUM_INSTRUCTIONS_MIN) + if(code_size >= NUM_INSTRUCTIONS_MIN) { break; } @@ -416,17 +417,19 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh // We need to add a few more MUL and ROR instructions to achieve minimal required latency for ASIC // Get this latency for at least 1 of the 4 registers const int prev_code_size = code_size; - while ((code_size < NUM_INSTRUCTIONS_MAX) && (asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY)) + while((code_size < NUM_INSTRUCTIONS_MAX) && (asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY)) { int min_idx = 0; int max_idx = 0; - for (int i = 1; i < 4; ++i) + for(int i = 1; i < 4; ++i) { - if (asic_latency[i] < asic_latency[min_idx]) min_idx = i; - if (asic_latency[i] > asic_latency[max_idx]) max_idx = i; + if(asic_latency[i] < asic_latency[min_idx]) + min_idx = i; + if(asic_latency[i] > asic_latency[max_idx]) + max_idx = i; } - const uint8_t pattern[3] = { ROR, MUL, MUL }; + const uint8_t pattern[3] = {ROR, MUL, MUL}; const uint8_t opcode = pattern[(code_size - prev_code_size) % 3]; latency[min_idx] = latency[max_idx] + op_latency[opcode]; asic_latency[min_idx] = asic_latency[max_idx] + asic_op_latency[opcode]; @@ -438,9 +441,9 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh ++code_size; } - // There is ~98.15% chance that loop condition is false, so this loop will execute only 1 iteration most of the time - // It never does more than 4 iterations for all block heights < 10,000,000 - } while (!r8_used || (code_size < NUM_INSTRUCTIONS_MIN) || (code_size > NUM_INSTRUCTIONS_MAX)); + // There is ~98.15% chance that loop condition is false, so this loop will execute only 1 iteration most of the time + // It never does more than 4 iterations for all block heights < 10,000,000 + } while(!r8_used || (code_size < NUM_INSTRUCTIONS_MIN) || (code_size > NUM_INSTRUCTIONS_MAX)); // It's guaranteed that NUM_INSTRUCTIONS_MIN <= code_size <= NUM_INSTRUCTIONS_MAX here // Add final instruction to stop the interpreter diff --git a/xmrstak/backend/cpu/hwlocMemory.cpp b/xmrstak/backend/cpu/hwlocMemory.cpp index 089570fc0..804edc55d 100644 --- a/xmrstak/backend/cpu/hwlocMemory.cpp +++ b/xmrstak/backend/cpu/hwlocMemory.cpp @@ -13,7 +13,7 @@ * * @param puId core id */ -void bindMemoryToNUMANode( size_t puId ) +void bindMemoryToNUMANode(size_t puId) { int depth; hwloc_topology_t topology; @@ -30,18 +30,18 @@ void bindMemoryToNUMANode( size_t puId ) depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PU); - for( uint32_t i = 0; + for(uint32_t i = 0; i < hwloc_get_nbobjs_by_depth(topology, depth); - i++ ) + i++) { hwloc_obj_t pu = hwloc_get_obj_by_depth(topology, depth, i); - if( pu->os_index == puId ) + if(pu->os_index == puId) { - if( 0 > hwloc_set_membind_nodeset( - topology, - pu->nodeset, - HWLOC_MEMBIND_BIND, - HWLOC_MEMBIND_THREAD)) + if(0 > hwloc_set_membind_nodeset( + topology, + pu->nodeset, + HWLOC_MEMBIND_BIND, + HWLOC_MEMBIND_THREAD)) { printer::inst()->print_msg(L0, "hwloc: can't bind memory"); } @@ -57,7 +57,7 @@ void bindMemoryToNUMANode( size_t puId ) } #else -void bindMemoryToNUMANode( size_t ) +void bindMemoryToNUMANode(size_t) { } diff --git a/xmrstak/backend/cpu/hwlocMemory.hpp b/xmrstak/backend/cpu/hwlocMemory.hpp index 2130c2ced..42fa3456f 100644 --- a/xmrstak/backend/cpu/hwlocMemory.hpp +++ b/xmrstak/backend/cpu/hwlocMemory.hpp @@ -9,4 +9,4 @@ * * @param puId core id */ -void bindMemoryToNUMANode( size_t puId ); +void bindMemoryToNUMANode(size_t puId); diff --git a/xmrstak/backend/cpu/jconf.cpp b/xmrstak/backend/cpu/jconf.cpp index a14be1732..a7bb91d61 100644 --- a/xmrstak/backend/cpu/jconf.cpp +++ b/xmrstak/backend/cpu/jconf.cpp @@ -37,7 +37,6 @@ #include #endif - namespace xmrstak { namespace cpu @@ -48,9 +47,14 @@ using namespace rapidjson; /* * This enum needs to match index in oConfigValues, otherwise we will get a runtime error */ -enum configEnum { aCpuThreadsConf, sUseSlowMem }; +enum configEnum +{ + aCpuThreadsConf, + sUseSlowMem +}; -struct configVal { +struct configVal +{ configEnum iName; const char* sName; Type iType; @@ -59,10 +63,9 @@ struct configVal { // Same order as in configEnum, as per comment above // kNullType means any type configVal oConfigValues[] = { - { aCpuThreadsConf, "cpu_threads_conf", kNullType } -}; + {aCpuThreadsConf, "cpu_threads_conf", kNullType}}; -constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0])); +constexpr size_t iConfigCnt = (sizeof(oConfigValues) / sizeof(oConfigValues[0])); inline bool checkType(Type have, Type want) { @@ -95,7 +98,7 @@ jconf::jconf() prv = new opaque_private(); } -bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) +bool jconf::GetThreadConfig(size_t id, thd_cfg& cfg) { if(!prv->configValues[aCpuThreadsConf]->IsArray()) return false; @@ -148,7 +151,6 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg) return true; } - size_t jconf::GetThreadCount() { if(prv->configValues[aCpuThreadsConf]->IsArray()) @@ -159,22 +161,22 @@ size_t jconf::GetThreadCount() bool jconf::parse_config(const char* sFilename) { - FILE * pFile; - char * buffer; + FILE* pFile; + char* buffer; size_t flen; pFile = fopen(sFilename, "rb"); - if (pFile == NULL) + if(pFile == NULL) { printer::inst()->print_msg(L0, "Failed to open config file %s.", sFilename); return false; } - fseek(pFile,0,SEEK_END); + fseek(pFile, 0, SEEK_END); flen = ftell(pFile); rewind(pFile); - if(flen >= 64*1024) + if(flen >= 64 * 1024) { fclose(pFile); printer::inst()->print_msg(L0, "Oversized config file - %s.", sFilename); @@ -189,7 +191,7 @@ bool jconf::parse_config(const char* sFilename) } buffer = (char*)malloc(flen + 3); - if(fread(buffer+1, flen, 1, pFile) != 1) + if(fread(buffer + 1, flen, 1, pFile) != 1) { free(buffer); fclose(pFile); @@ -211,7 +213,7 @@ bool jconf::parse_config(const char* sFilename) buffer[flen] = '}'; buffer[flen + 1] = '\0'; - prv->jsonDoc.Parse(buffer, flen+2); + prv->jsonDoc.Parse(buffer, flen + 2); free(buffer); if(prv->jsonDoc.HasParseError()) @@ -251,7 +253,7 @@ bool jconf::parse_config(const char* sFilename) } thd_cfg c; - for(size_t i=0; i < GetThreadCount(); i++) + for(size_t i = 0; i < GetThreadCount(); i++) { if(!GetThreadConfig(i, c)) { diff --git a/xmrstak/backend/cpu/jconf.hpp b/xmrstak/backend/cpu/jconf.hpp index 4ec9165d5..67dbd0275 100644 --- a/xmrstak/backend/cpu/jconf.hpp +++ b/xmrstak/backend/cpu/jconf.hpp @@ -12,16 +12,18 @@ namespace cpu class jconf { -public: + public: static jconf* inst() { - if (oInst == nullptr) oInst = new jconf; + if(oInst == nullptr) + oInst = new jconf; return oInst; }; bool parse_config(const char* sFilename = params::inst().configFileCPU.c_str()); - struct thd_cfg { + struct thd_cfg + { int iMultiway; bool bNoPrefetch; std::string asm_version_str; @@ -29,10 +31,10 @@ class jconf }; size_t GetThreadCount(); - bool GetThreadConfig(size_t id, thd_cfg &cfg); + bool GetThreadConfig(size_t id, thd_cfg& cfg); bool NeedsAutoconf(); -private: + private: jconf(); static jconf* oInst; diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index e90b59500..463be1aab 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -23,33 +23,33 @@ #include "crypto/cryptonight_aesni.h" -#include "xmrstak/misc/console.hpp" -#include "xmrstak/backend/iBackend.hpp" +#include "jconf.hpp" +#include "xmrstak/backend/cpu/cpuType.hpp" #include "xmrstak/backend/globalStates.hpp" +#include "xmrstak/backend/iBackend.hpp" #include "xmrstak/misc/configEditor.hpp" -#include "xmrstak/backend/cpu/cpuType.hpp" +#include "xmrstak/misc/console.hpp" #include "xmrstak/params.hpp" -#include "jconf.hpp" -#include "xmrstak/misc/executor.hpp" #include "minethd.hpp" #include "xmrstak/jconf.hpp" +#include "xmrstak/misc/executor.hpp" #include "hwlocMemory.hpp" #include "xmrstak/backend/miner_work.hpp" #ifndef CONF_NO_HWLOC -# include "autoAdjustHwloc.hpp" +#include "autoAdjustHwloc.hpp" #else -# include "autoAdjust.hpp" +#include "autoAdjust.hpp" #endif #include -#include +#include #include +#include #include #include -#include #include #ifdef _WIN32 @@ -58,9 +58,9 @@ #include #if defined(__APPLE__) -#include #include -#define SYSCTL_CORE_COUNT "machdep.cpu.core_count" +#include +#define SYSCTL_CORE_COUNT "machdep.cpu.core_count" #elif defined(__FreeBSD__) #include #endif //__APPLE__ @@ -87,7 +87,7 @@ bool minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id } #elif defined(__APPLE__) thread_port_t mach_thread; - thread_affinity_policy_data_t policy = { static_cast(cpu_id) }; + thread_affinity_policy_data_t policy = {static_cast(cpu_id)}; mach_thread = pthread_mach_thread_np(h); return thread_policy_set(mach_thread, THREAD_AFFINITY_POLICY, (thread_policy_t)&policy, 1) == KERN_SUCCESS; #elif defined(__FreeBSD__) @@ -96,8 +96,8 @@ bool minethd::thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id CPU_SET(cpu_id, &mn); return pthread_setaffinity_np(h, sizeof(cpuset_t), &mn) == 0; #elif defined(__OpenBSD__) - printer::inst()->print_msg(L0,"WARNING: thread pinning is not supported under OPENBSD."); - return true; + printer::inst()->print_msg(L0, "WARNING: thread pinning is not supported under OPENBSD."); + return true; #else cpu_set_t mn; CPU_ZERO(&mn); @@ -120,7 +120,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, std::unique_lock lck(thd_aff_set); std::future order_guard = order_fix.get_future(); - switch (iMultiway) + switch(iMultiway) { case 5: oWorkThd = std::thread(&minethd::penta_work_main, this); @@ -150,13 +150,13 @@ minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, cryptonight_ctx* minethd::minethd_alloc_ctx() { cryptonight_ctx* ctx; - alloc_msg msg = { 0 }; + alloc_msg msg = {0}; - switch (::jconf::inst()->GetSlowMemSetting()) + switch(::jconf::inst()->GetSlowMemSetting()) { case ::jconf::never_use: ctx = cryptonight_alloc_ctx(1, 1, &msg); - if (ctx == NULL) + if(ctx == NULL) printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: %s", msg.warning); else { @@ -170,7 +170,7 @@ cryptonight_ctx* minethd::minethd_alloc_ctx() case ::jconf::no_mlck: ctx = cryptonight_alloc_ctx(1, 0, &msg); - if (ctx == NULL) + if(ctx == NULL) printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: %s", msg.warning); else { @@ -184,12 +184,12 @@ cryptonight_ctx* minethd::minethd_alloc_ctx() case ::jconf::print_warning: ctx = cryptonight_alloc_ctx(1, 1, &msg); - if (msg.warning != NULL) + if(msg.warning != NULL) printer::inst()->print_msg(L0, "MEMORY ALLOC FAILED: %s", msg.warning); - if (ctx == NULL) + if(ctx == NULL) ctx = cryptonight_alloc_ctx(0, 0, NULL); - if (ctx != NULL) + if(ctx != NULL) { ctx->hash_fn = nullptr; ctx->loop_fn = nullptr; @@ -220,11 +220,11 @@ cryptonight_ctx* minethd::minethd_alloc_ctx() static constexpr size_t MAX_N = 5; bool minethd::self_test() { - alloc_msg msg = { 0 }; + alloc_msg msg = {0}; size_t res; bool fatal = false; - switch (::jconf::inst()->GetSlowMemSetting()) + switch(::jconf::inst()->GetSlowMemSetting()) { case ::jconf::never_use: res = cryptonight_init(1, 1, &msg); @@ -255,13 +255,13 @@ bool minethd::self_test() if(res == 0 && fatal) return false; - cryptonight_ctx *ctx[MAX_N] = {0}; - for (int i = 0; i < MAX_N; i++) + cryptonight_ctx* ctx[MAX_N] = {0}; + for(int i = 0; i < MAX_N; i++) { - if ((ctx[i] = minethd_alloc_ctx()) == nullptr) + if((ctx[i] = minethd_alloc_ctx()) == nullptr) { printer::inst()->print_msg(L0, "ERROR: miner was not able to allocate memory."); - for (int j = 0; j < i; j++) + for(int j = 0; j < i; j++) cryptonight_free_ctx(ctx[j]); return false; } @@ -279,63 +279,68 @@ bool minethd::self_test() { func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("This is a test", 14, out, ctx, algo); - bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; + bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; minethd::cn_on_new_job dm; func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo); ctx[0]->hash_fn("This is a test", 14, out, ctx, algo); - bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; + bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; func_multi_selector<2>(ctx, dm, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx, algo); - bResult = bResult && memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" - "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; + bResult = bResult && memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" + "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", + 64) == 0; func_multi_selector<2>(ctx, dm, ::jconf::inst()->HaveHardwareAes(), true, algo); ctx[0]->hash_fn("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx, algo); - bResult = bResult && memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" - "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; + bResult = bResult && memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" + "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", + 64) == 0; func_multi_selector<3>(ctx, dm, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("This is a testThis is a testThis is a test", 14, out, ctx, algo); - bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 96) == 0; + bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", + 96) == 0; func_multi_selector<4>(ctx, dm, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("This is a testThis is a testThis is a testThis is a test", 14, out, ctx, algo); - bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 128) == 0; + bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", + 128) == 0; func_multi_selector<5>(ctx, dm, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("This is a testThis is a testThis is a testThis is a testThis is a test", 14, out, ctx, algo); - bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" - "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 160) == 0; + bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" + "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", + 160) == 0; } else if(algo == POW(cryptonight_lite)) { func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0; + bResult = bResult && memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0; func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0; + bResult = bResult && memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0; } else if(algo == POW(cryptonight_monero)) { func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0; + bResult = bResult && memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0; func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0; + bResult = bResult && memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0; } else if(algo == POW(cryptonight_monero_v8)) { @@ -351,61 +356,61 @@ bool minethd::self_test() { func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0; + bResult = bResult && memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0; func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0; + bResult = bResult && memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0; } else if(algo == POW(cryptonight_ipbc)) { func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0xb0", 32) == 0; + bResult = bResult && memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0xb0", 32) == 0; func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0", 32) == 0; + bResult = bResult && memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0", 32) == 0; } else if(algo == POW(cryptonight_stellite)) { func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0; + bResult = bResult && memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0; func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0; + bResult = bResult && memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0; } else if(algo == POW(cryptonight_masari)) { func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0; + bResult = bResult && memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0; func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0; + bResult = bResult && memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0; } else if(algo == POW(cryptonight_heavy)) { func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0; + bResult = bResult && memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0; func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0; + bResult = bResult && memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0; } else if(algo == POW(cryptonight_haven)) { func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0; + bResult = bResult && memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0; func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0; + bResult = bResult && memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0; } else if(algo == POW(cryptonight_bittube2)) { @@ -415,7 +420,7 @@ bool minethd::self_test() func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("\x38\x27\x4c\x97\xc4\x5a\x17\x2c\xfc\x97\x67\x98\x70\x42\x2e\x3a\x1a\xb0\x78\x49\x60\xc6\x05\x14\xd8\x16\x27\x14\x15\xc3\x06\xee\x3a\x3e\xd1\xa7\x7e\x31\xf6\xa8\x85\xc3\xcb\xff\x01\x02\x03\x04", 48, out, ctx, algo); - bResult = bResult && memcmp(out, "\x18\x2c\x30\x41\x93\x1a\x14\x73\xc6\xbf\x7e\x77\xfe\xb5\x17\x9b\xa8\xbe\xa9\x68\xba\x9e\xe1\xe8\x24\x1a\x12\x7a\xac\x81\xb4\x24", 32) == 0; + bResult = bResult && memcmp(out, "\x18\x2c\x30\x41\x93\x1a\x14\x73\xc6\xbf\x7e\x77\xfe\xb5\x17\x9b\xa8\xbe\xa9\x68\xba\x9e\xe1\xe8\x24\x1a\x12\x7a\xac\x81\xb4\x24", 32) == 0; ctx[0]->hash_fn("\x04\x04\xb4\x94\xce\xd9\x05\x18\xe7\x25\x5d\x01\x28\x63\xde\x8a\x4d\x27\x72\xb1\xff\x78\x8c\xd0\x56\x20\x38\x98\x3e\xd6\x8c\x94\xea\x00\xfe\x43\x66\x68\x83\x00\x00\x00\x00\x18\x7c\x2e\x0f\x66\xf5\x6b\xb9\xef\x67\xed\x35\x14\x5c\x69\xd4\x69\x0d\x1f\x98\x22\x44\x01\x2b\xea\x69\x6e\xe8\xb3\x3c\x42\x12\x01", 76, out, ctx, algo); bResult = bResult && memcmp(out, "\x7f\xbe\xb9\x92\x76\x87\x5a\x3c\x43\xc2\xbe\x5a\x73\x36\x06\xb5\xdc\x79\xcc\x9c\xf3\x7c\x43\x3e\xb4\x18\x56\x17\xfb\x9b\xc9\x36", 32) == 0; @@ -427,29 +432,29 @@ bool minethd::self_test() { func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("\x03\x05\xa0\xdb\xd6\xbf\x05\xcf\x16\xe5\x03\xf3\xa6\x6f\x78\x00\x7c\xbf\x34\x14\x43\x32\xec\xbf\xc2\x2e\xd9\x5c\x87\x00\x38\x3b\x30\x9a\xce\x19\x23\xa0\x96\x4b\x00\x00\x00\x08\xba\x93\x9a\x62\x72\x4c\x0d\x75\x81\xfc\xe5\x76\x1e\x9d\x8a\x0e\x6a\x1c\x3f\x92\x4f\xdd\x84\x93\xd1\x11\x56\x49\xc0\x5e\xb6\x01", 76, out, ctx, algo); - bResult = bResult && memcmp(out, "\x40\x86\x5a\xa8\x87\x41\xec\x1d\xcc\xbd\x2b\xc6\xff\x36\xb9\x4d\x54\x71\x58\xdb\x94\x69\x8e\x3c\xa0\x3d\xe4\x81\x9a\x65\x9f\xef", 32) == 0; + bResult = bResult && memcmp(out, "\x40\x86\x5a\xa8\x87\x41\xec\x1d\xcc\xbd\x2b\xc6\xff\x36\xb9\x4d\x54\x71\x58\xdb\x94\x69\x8e\x3c\xa0\x3d\xe4\x81\x9a\x65\x9f\xef", 32) == 0; } else if(algo == POW(cryptonight_gpu)) { func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("", 0, out, ctx, algo); - bResult = bResult && memcmp(out, "\x55\x5e\x0a\xee\x78\x79\x31\x6d\x7d\xef\xf7\x72\x97\x3c\xb9\x11\x8e\x38\x95\x70\x9d\xb2\x54\x7a\xc0\x72\xd5\xb9\x13\x10\x01\xd8", 32) == 0; + bResult = bResult && memcmp(out, "\x55\x5e\x0a\xee\x78\x79\x31\x6d\x7d\xef\xf7\x72\x97\x3c\xb9\x11\x8e\x38\x95\x70\x9d\xb2\x54\x7a\xc0\x72\xd5\xb9\x13\x10\x01\xd8", 32) == 0; func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo); ctx[0]->hash_fn("", 0, out, ctx, algo); - bResult = bResult && memcmp(out, "\x55\x5e\x0a\xee\x78\x79\x31\x6d\x7d\xef\xf7\x72\x97\x3c\xb9\x11\x8e\x38\x95\x70\x9d\xb2\x54\x7a\xc0\x72\xd5\xb9\x13\x10\x01\xd8", 32) == 0; + bResult = bResult && memcmp(out, "\x55\x5e\x0a\xee\x78\x79\x31\x6d\x7d\xef\xf7\x72\x97\x3c\xb9\x11\x8e\x38\x95\x70\x9d\xb2\x54\x7a\xc0\x72\xd5\xb9\x13\x10\x01\xd8", 32) == 0; } else if(algo == POW(cryptonight_conceal)) { func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("", 0, out, ctx, algo); - bResult = bResult && memcmp(out, "\xb5\x54\x4b\x58\x16\x70\x26\x47\x63\x47\xe4\x1f\xb6\x5e\x57\xc9\x7c\xa5\x93\xfe\x0e\xb1\x0f\xb9\x2f\xa7\x3e\x5b\xae\xef\x79\x8c", 32) == 0; + bResult = bResult && memcmp(out, "\xb5\x54\x4b\x58\x16\x70\x26\x47\x63\x47\xe4\x1f\xb6\x5e\x57\xc9\x7c\xa5\x93\xfe\x0e\xb1\x0f\xb9\x2f\xa7\x3e\x5b\xae\xef\x79\x8c", 32) == 0; func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), true, algo); ctx[0]->hash_fn("", 0, out, ctx, algo); - bResult = bResult && memcmp(out, "\xb5\x54\x4b\x58\x16\x70\x26\x47\x63\x47\xe4\x1f\xb6\x5e\x57\xc9\x7c\xa5\x93\xfe\x0e\xb1\x0f\xb9\x2f\xa7\x3e\x5b\xae\xef\x79\x8c", 32) == 0; + bResult = bResult && memcmp(out, "\xb5\x54\x4b\x58\x16\x70\x26\x47\x63\x47\xe4\x1f\xb6\x5e\x57\xc9\x7c\xa5\x93\xfe\x0e\xb1\x0f\xb9\x2f\xa7\x3e\x5b\xae\xef\x79\x8c", 32) == 0; } - else if (algo == POW(cryptonight_turtle)) + else if(algo == POW(cryptonight_turtle)) { func_selector(ctx, ::jconf::inst()->HaveHardwareAes(), false, algo); ctx[0]->hash_fn("This is a test This is a test This is a test", 44, out, ctx, algo); @@ -467,7 +472,7 @@ bool minethd::self_test() work.iBlockHeight = 1806260; set_job(work, ctx); ctx[0]->hash_fn("\x54\x68\x69\x73\x20\x69\x73\x20\x61\x20\x74\x65\x73\x74\x20\x54\x68\x69\x73\x20\x69\x73\x20\x61\x20\x74\x65\x73\x74\x20\x54\x68\x69\x73\x20\x69\x73\x20\x61\x20\x74\x65\x73\x74", 44, out, ctx, algo); - bResult = bResult && memcmp(out, "\xf7\x59\x58\x8a\xd5\x7e\x75\x84\x67\x29\x54\x43\xa9\xbd\x71\x49\x0a\xbf\xf8\xe9\xda\xd1\xb9\x5b\x6b\xf2\xf5\xd0\xd7\x83\x87\xbc", 32) == 0; + bResult = bResult && memcmp(out, "\xf7\x59\x58\x8a\xd5\x7e\x75\x84\x67\x29\x54\x43\xa9\xbd\x71\x49\x0a\xbf\xf8\xe9\xda\xd1\xb9\x5b\x6b\xf2\xf5\xd0\xd7\x83\x87\xbc", 32) == 0; } else if(algo == POW(cryptonight_v8_reversewaltz)) { @@ -498,7 +503,7 @@ bool minethd::self_test() "Cryptonight hash self-test failed. This might be caused by bad compiler optimizations."); } - for (int i = 0; i < MAX_N; i++) + for(int i = 0; i < MAX_N; i++) cryptonight_free_ctx(ctx[i]); return bResult; @@ -520,14 +525,13 @@ std::vector minethd::thread_starter(uint32_t threadOffset, miner_work win_exit(); } - //Launch the requested number of single and double threads, to distribute //load evenly we need to alternate single and double threads size_t i, n = jconf::inst()->GetThreadCount(); pvThreads.reserve(n); jconf::thd_cfg cfg; - for (i = 0; i < n; i++) + for(i = 0; i < n; i++) { jconf::inst()->GetThreadConfig(i, cfg); @@ -572,11 +576,11 @@ static std::string getAsmName(const uint32_t num_hashes) return asm_type; } -template +template void minethd::func_multi_selector(cryptonight_ctx** ctx, minethd::cn_on_new_job& on_new_job, bool bHaveAes, bool bNoPrefetch, const xmrstak_algo& algo, const std::string& asm_version_str) { - static_assert(N >= 1, "number of threads must be >= 1" ); + static_assert(N >= 1, "number of threads must be >= 1"); // We have two independent flag bits in the functions // therefore we will build a binary digit and select the @@ -717,21 +721,20 @@ void minethd::func_multi_selector(cryptonight_ctx** ctx, minethd::cn_on_new_job& Cryptonight_hash::template hash, Cryptonight_hash::template hash, Cryptonight_hash::template hash, - Cryptonight_hash::template hash - }; + Cryptonight_hash::template hash}; std::bitset<2> digit; digit.set(0, !bHaveAes); digit.set(1, !bNoPrefetch); - ctx[0]->hash_fn = func_table[ algv << 2 | digit.to_ulong() ]; + ctx[0]->hash_fn = func_table[algv << 2 | digit.to_ulong()]; // check for asm optimized version for cryptonight_v8 if(algo == cryptonight_monero_v8) { std::string selected_asm = asm_version_str; if(selected_asm == "auto") - selected_asm = cpu::getAsmName(N); + selected_asm = cpu::getAsmName(N); if(selected_asm != "off") { @@ -747,7 +750,7 @@ void minethd::func_multi_selector(cryptonight_ctx** ctx, minethd::cn_on_new_job& { std::string selected_asm = asm_version_str; if(selected_asm == "auto") - selected_asm = cpu::getAsmName(N); + selected_asm = cpu::getAsmName(N); if(selected_asm == "off") { for(int h = 0; h < N; ++h) @@ -769,7 +772,7 @@ void minethd::func_multi_selector(cryptonight_ctx** ctx, minethd::cn_on_new_job& }; auto it = on_new_job_map.find(algo.Id()); - if (it != on_new_job_map.end()) + if(it != on_new_job_map.end()) on_new_job = it->second; else on_new_job = nullptr; @@ -806,18 +809,18 @@ void minethd::penta_work_main() multiway_work_main<5u>(); } -template -void minethd::prep_multiway_work(uint8_t *bWorkBlob, uint32_t **piNonce) +template +void minethd::prep_multiway_work(uint8_t* bWorkBlob, uint32_t** piNonce) { - for (size_t i = 0; i < N; i++) + for(size_t i = 0; i < N; i++) { memcpy(bWorkBlob + oWork.iWorkSize * i, oWork.bWorkBlob, oWork.iWorkSize); - if (i > 0) + if(i > 0) piNonce[i] = (uint32_t*)(bWorkBlob + oWork.iWorkSize * i + 39); } } -template +template void minethd::multiway_work_main() { if(affinity >= 0) //-1 means no affinity @@ -828,22 +831,23 @@ void minethd::multiway_work_main() lck.release(); std::this_thread::yield(); - cryptonight_ctx *ctx[MAX_N]; + cryptonight_ctx* ctx[MAX_N]; uint64_t iCount = 0; - uint64_t *piHashVal[MAX_N]; - uint32_t *piNonce[MAX_N]; + uint64_t iLastCount = 0; + uint64_t* piHashVal[MAX_N]; + uint32_t* piNonce[MAX_N]; uint8_t bHashOut[MAX_N * 32]; uint8_t bWorkBlob[sizeof(miner_work::bWorkBlob) * MAX_N]; uint32_t iNonce; job_result res; - for (size_t i = 0; i < N; i++) + for(size_t i = 0; i < N; i++) { ctx[i] = minethd_alloc_ctx(); if(ctx[i] == nullptr) { printer::inst()->print_msg(L0, "ERROR: miner was not able to allocate memory."); - for (int j = 0; j < i; j++) + for(int j = 0; j < i; j++) cryptonight_free_ctx(ctx[j]); win_exit(1); } @@ -863,15 +867,15 @@ void minethd::multiway_work_main() size_t lastPoolId = 0; func_multi_selector(ctx, on_new_job, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str); - while (bQuit == 0) + while(bQuit == 0) { - if (oWork.bStall) + if(oWork.bStall) { /* We are stalled here because the executor didn't find a job for us yet, either because of network latency, or a socket problem. Since we are raison d'etre of this software it us sensible to just wait until we have something*/ - while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) + while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) std::this_thread::sleep_for(std::chrono::milliseconds(100)); globalStates::inst().consume_work(oWork, iJobNo); @@ -908,13 +912,12 @@ void minethd::multiway_work_main() if(on_new_job != nullptr) on_new_job(oWork, ctx); - while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) + while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) { - if ((iCount++ & 0x7) == 0) //Store stats every 8*N hashes + if((iCount++ & 0x7) == 0) //Store stats every 8*N hashes { - uint64_t iStamp = get_timestamp_ms(); - iHashCount.store(iCount * N, std::memory_order_relaxed); - iTimestamp.store(iStamp, std::memory_order_relaxed); + updateStats((iCount - iLastCount) * N, oWork.iPoolId); + iLastCount = iCount; } nonce_ctr -= N; @@ -927,19 +930,18 @@ void minethd::multiway_work_main() break; } - for (size_t i = 0; i < N; i++) + for(size_t i = 0; i < N; i++) *piNonce[i] = iNonce++; ctx[0]->hash_fn(bWorkBlob, oWork.iWorkSize, bHashOut, ctx, miner_algo); - for (size_t i = 0; i < N; i++) + for(size_t i = 0; i < N; i++) { - if (*piHashVal[i] < oWork.iTarget) + if(*piHashVal[i] < oWork.iTarget) { executor::inst()->push_event( ex_event(job_result(oWork.sJobID, iNonce - N + i, bHashOut + 32 * i, iThreadNo, miner_algo), - oWork.iPoolId) - ); + oWork.iPoolId)); } } @@ -950,7 +952,7 @@ void minethd::multiway_work_main() prep_multiway_work(bWorkBlob, piNonce); } - for (int i = 0; i < N; i++) + for(int i = 0; i < N; i++) cryptonight_free_ctx(ctx[i]); } diff --git a/xmrstak/backend/cpu/minethd.hpp b/xmrstak/backend/cpu/minethd.hpp index 1e25f5d4f..a5201f37a 100644 --- a/xmrstak/backend/cpu/minethd.hpp +++ b/xmrstak/backend/cpu/minethd.hpp @@ -1,15 +1,15 @@ #pragma once -#include "xmrstak/jconf.hpp" #include "crypto/cryptonight.h" -#include "xmrstak/backend/miner_work.hpp" #include "xmrstak/backend/iBackend.hpp" +#include "xmrstak/backend/miner_work.hpp" +#include "xmrstak/jconf.hpp" +#include +#include #include #include #include -#include -#include namespace xmrstak { @@ -18,7 +18,7 @@ namespace cpu class minethd : public iBackend { -public: + public: static std::vector thread_starter(uint32_t threadOffset, miner_work& pWork); static bool self_test(); @@ -29,19 +29,18 @@ class minethd : public iBackend static cryptonight_ctx* minethd_alloc_ctx(); - template + template static void func_multi_selector(cryptonight_ctx**, minethd::cn_on_new_job& on_new_job, - bool bHaveAes, bool bNoPrefetch, const xmrstak_algo& algo, const std::string& asm_version_str = "off"); + bool bHaveAes, bool bNoPrefetch, const xmrstak_algo& algo, const std::string& asm_version_str = "off"); - private: - + private: minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version); - template + template void multiway_work_main(); - template - void prep_multiway_work(uint8_t *bWorkBlob, uint32_t **piNonce); + template + void prep_multiway_work(uint8_t* bWorkBlob, uint32_t** piNonce); void work_main(); void double_work_main(); diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp index e58665922..262865ea0 100644 --- a/xmrstak/backend/cryptonight.hpp +++ b/xmrstak/backend/cryptonight.hpp @@ -1,9 +1,9 @@ #pragma once -#include +#include #include -#include +#include #include -#include +#include constexpr size_t start_derived_algo_id = 1000; @@ -15,10 +15,10 @@ enum xmrstak_algo_id cryptonight_monero = 3, cryptonight_heavy = 4, cryptonight_aeon = 5, - cryptonight_ipbc = 6, // equal to cryptonight_aeon with a small tweak in the miner code - cryptonight_stellite = 7, //equal to cryptonight_monero but with one tiny change - cryptonight_masari = 8, //equal to cryptonight_monero but with less iterations, used by masari - cryptonight_haven = 9, // equal to cryptonight_heavy with a small tweak + cryptonight_ipbc = 6, // equal to cryptonight_aeon with a small tweak in the miner code + cryptonight_stellite = 7, //equal to cryptonight_monero but with one tiny change + cryptonight_masari = 8, //equal to cryptonight_monero but with less iterations, used by masari + cryptonight_haven = 9, // equal to cryptonight_heavy with a small tweak cryptonight_bittube2 = 10, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks cryptonight_monero_v8 = 11, cryptonight_superfast = 12, @@ -42,35 +42,32 @@ enum xmrstak_algo_id inline std::string get_algo_name(xmrstak_algo_id algo_id) { static std::array base_algo_names = - {{ - "invalid_algo", - "cryptonight", - "cryptonight_lite", - "cryptonight_v7", - "cryptonight_heavy", - "cryptonight_lite_v7", - "cryptonight_lite_v7_xor", - "cryptonight_v7_stellite", - "cryptonight_masari", - "cryptonight_haven", - "cryptonight_bittube2", - "cryptonight_v8", - "cryptonight_superfast", - "cryptonight_gpu", - "cryptonight_conceal", - "cryptonight_r_wow", - "cryptonight_r", - "cryptonight_v8_reversewaltz" // used by graft - }}; + {{ + "invalid_algo", + "cryptonight", + "cryptonight_lite", + "cryptonight_v7", + "cryptonight_heavy", + "cryptonight_lite_v7", + "cryptonight_lite_v7_xor", + "cryptonight_v7_stellite", + "cryptonight_masari", + "cryptonight_haven", + "cryptonight_bittube2", + "cryptonight_v8", + "cryptonight_superfast", + "cryptonight_gpu", + "cryptonight_conceal", + "cryptonight_r_wow", + "cryptonight_r", + "cryptonight_v8_reversewaltz" // used by graft + }}; static std::array derived_algo_names = - {{ - "cryptonight_turtle", - "cryptonight_v8_half", // used by masari and stellite - "cryptonight_v8_zelerius", - "cryptonight_v8_double" - }}; - + {{"cryptonight_turtle", + "cryptonight_v8_half", // used by masari and stellite + "cryptonight_v8_zelerius", + "cryptonight_v8_double"}}; if(algo_id < start_derived_algo_id) return base_algo_names[algo_id]; @@ -80,19 +77,35 @@ inline std::string get_algo_name(xmrstak_algo_id algo_id) struct xmrstak_algo { - xmrstak_algo(xmrstak_algo_id name_id) : algo_name(name_id), base_algo(name_id) + xmrstak_algo(xmrstak_algo_id name_id) : + algo_name(name_id), + base_algo(name_id) { } - xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm) : algo_name(name_id), base_algo(algorithm) + xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm) : + algo_name(name_id), + base_algo(algorithm) { } - xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration) : algo_name(name_id), base_algo(algorithm), iter(iteration) + xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration) : + algo_name(name_id), + base_algo(algorithm), + iter(iteration) { } - xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration, size_t memory) : algo_name(name_id), base_algo(algorithm), iter(iteration), mem(memory) + xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration, size_t memory) : + algo_name(name_id), + base_algo(algorithm), + iter(iteration), + mem(memory) { } - xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration, size_t memory, uint32_t mem_mask) : algo_name(name_id), base_algo(algorithm), iter(iteration), mem(memory), mask(mem_mask) + xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration, size_t memory, uint32_t mem_mask) : + algo_name(name_id), + base_algo(algorithm), + iter(iteration), + mem(memory), + mask(mem_mask) { } @@ -187,35 +200,33 @@ constexpr uint32_t CN_DOUBLE_ITER = 0x100000; inline xmrstak_algo POW(xmrstak_algo_id algo_id) { - static std::array pow = {{ - {invalid_algo, invalid_algo}, + static std::array pow = {{{invalid_algo, invalid_algo}, {cryptonight, cryptonight, CN_ITER, CN_MEMORY}, - {cryptonight_lite, cryptonight_lite, CN_ITER/2, CN_MEMORY/2}, + {cryptonight_lite, cryptonight_lite, CN_ITER / 2, CN_MEMORY / 2}, {cryptonight_monero, cryptonight_monero, CN_ITER, CN_MEMORY}, - {cryptonight_heavy, cryptonight_heavy, CN_ITER/2, CN_MEMORY*2}, - {cryptonight_aeon, cryptonight_aeon, CN_ITER/2, CN_MEMORY/2}, - {cryptonight_ipbc, cryptonight_ipbc, CN_ITER/2, CN_MEMORY/2}, // equal to cryptonight_aeon with a small tweak in the miner code - {cryptonight_stellite, cryptonight_stellite, CN_ITER, CN_MEMORY}, //equal to cryptonight_monero but with one tiny change - {cryptonight_masari, cryptonight_masari, CN_ITER/2, CN_MEMORY}, //equal to cryptonight_monero but with less iterations, used by masari - {cryptonight_haven, cryptonight_haven, CN_ITER/2, CN_MEMORY*2}, // equal to cryptonight_heavy with a small tweak - {cryptonight_bittube2, cryptonight_bittube2, CN_ITER/2, CN_MEMORY*2}, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks + {cryptonight_heavy, cryptonight_heavy, CN_ITER / 2, CN_MEMORY * 2}, + {cryptonight_aeon, cryptonight_aeon, CN_ITER / 2, CN_MEMORY / 2}, + {cryptonight_ipbc, cryptonight_ipbc, CN_ITER / 2, CN_MEMORY / 2}, // equal to cryptonight_aeon with a small tweak in the miner code + {cryptonight_stellite, cryptonight_stellite, CN_ITER, CN_MEMORY}, //equal to cryptonight_monero but with one tiny change + {cryptonight_masari, cryptonight_masari, CN_ITER / 2, CN_MEMORY}, //equal to cryptonight_monero but with less iterations, used by masari + {cryptonight_haven, cryptonight_haven, CN_ITER / 2, CN_MEMORY * 2}, // equal to cryptonight_heavy with a small tweak + {cryptonight_bittube2, cryptonight_bittube2, CN_ITER / 2, CN_MEMORY * 2}, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks {cryptonight_monero_v8, cryptonight_monero_v8, CN_ITER, CN_MEMORY}, - {cryptonight_superfast, cryptonight_superfast, CN_ITER/4, CN_MEMORY}, + {cryptonight_superfast, cryptonight_superfast, CN_ITER / 4, CN_MEMORY}, {cryptonight_gpu, cryptonight_gpu, CN_GPU_ITER, CN_MEMORY, CN_GPU_MASK}, - {cryptonight_conceal, cryptonight_conceal, CN_ITER/2, CN_MEMORY}, + {cryptonight_conceal, cryptonight_conceal, CN_ITER / 2, CN_MEMORY}, {cryptonight_r_wow, cryptonight_r_wow, CN_ITER, CN_MEMORY}, {cryptonight_r, cryptonight_r, CN_ITER, CN_MEMORY}, - {cryptonight_v8_reversewaltz, cryptonight_v8_reversewaltz, CN_WALTZ_ITER, CN_MEMORY} - }}; + {cryptonight_v8_reversewaltz, cryptonight_v8_reversewaltz, CN_WALTZ_ITER, CN_MEMORY}}}; static std::array derived_pow = - {{ - {cryptonight_turtle, cryptonight_monero_v8, CN_ITER/8, CN_MEMORY/8, CN_TURTLE_MASK}, - {cryptonight_v8_half, cryptonight_monero_v8, CN_ITER/2, CN_MEMORY}, - {cryptonight_v8_zelerius, cryptonight_monero_v8, CN_ZELERIUS_ITER, CN_MEMORY}, - {cryptonight_v8_double, cryptonight_monero_v8, CN_DOUBLE_ITER, CN_MEMORY} - // {cryptonight_derived} - }}; + {{ + {cryptonight_turtle, cryptonight_monero_v8, CN_ITER / 8, CN_MEMORY / 8, CN_TURTLE_MASK}, + {cryptonight_v8_half, cryptonight_monero_v8, CN_ITER / 2, CN_MEMORY}, + {cryptonight_v8_zelerius, cryptonight_monero_v8, CN_ZELERIUS_ITER, CN_MEMORY}, + {cryptonight_v8_double, cryptonight_monero_v8, CN_DOUBLE_ITER, CN_MEMORY} + // {cryptonight_derived} + }}; if(algo_id < start_derived_algo_id) return pow[algo_id]; diff --git a/xmrstak/backend/globalStates.cpp b/xmrstak/backend/globalStates.cpp index 52ef3f391..5b4332ba4 100644 --- a/xmrstak/backend/globalStates.cpp +++ b/xmrstak/backend/globalStates.cpp @@ -21,15 +21,14 @@ * */ -#include "miner_work.hpp" #include "globalStates.hpp" +#include "miner_work.hpp" #include -#include #include +#include #include - namespace xmrstak { diff --git a/xmrstak/backend/globalStates.hpp b/xmrstak/backend/globalStates.hpp index d6966c4a2..a3ff30eea 100644 --- a/xmrstak/backend/globalStates.hpp +++ b/xmrstak/backend/globalStates.hpp @@ -1,10 +1,10 @@ #pragma once #include "xmrstak/backend/miner_work.hpp" -#include "xmrstak/misc/environment.hpp" -#include "xmrstak/misc/console.hpp" #include "xmrstak/backend/pool_data.hpp" #include "xmrstak/cpputil/read_write_lock.h" +#include "xmrstak/misc/console.hpp" +#include "xmrstak/misc/environment.hpp" #include @@ -32,7 +32,7 @@ struct globalStates nonce = iGlobalNonce.fetch_add(reserve_count); } - void consume_work( miner_work& threadWork, uint64_t& currentJobId); + void consume_work(miner_work& threadWork, uint64_t& currentJobId); miner_work oGlobalWork; std::atomic iGlobalJobNo; @@ -41,8 +41,11 @@ struct globalStates uint64_t iThreadCount; size_t pool_id = invalid_pool_id; -private: - globalStates() : iThreadCount(0), iGlobalJobNo(0), iConsumeCnt(0) + private: + globalStates() : + iThreadCount(0), + iGlobalJobNo(0), + iConsumeCnt(0) { } diff --git a/xmrstak/backend/iBackend.hpp b/xmrstak/backend/iBackend.hpp index 18411b79c..dd59b6c52 100644 --- a/xmrstak/backend/iBackend.hpp +++ b/xmrstak/backend/iBackend.hpp @@ -1,12 +1,13 @@ #pragma once #include "xmrstak/backend/globalStates.hpp" +#include "xmrstak/net/msgstruct.hpp" #include -#include #include -#include +#include #include +#include template constexpr std::size_t countof(T const (&)[N]) noexcept @@ -16,35 +17,65 @@ constexpr std::size_t countof(T const (&)[N]) noexcept namespace xmrstak { - struct iBackend +struct iBackend +{ + + enum BackendType : uint32_t { + UNKNOWN = 0u, + CPU = 1u, + AMD = 2u, + NVIDIA = 3u + }; - enum BackendType : uint32_t { UNKNOWN = 0u, CPU = 1u, AMD = 2u, NVIDIA = 3u }; + static const char* getName(const BackendType type) + { + const char* backendNames[] = { + "unknown", + "cpu", + "amd", + "nvidia"}; - static const char* getName(const BackendType type) - { - const char* backendNames[] = { - "unknown", - "cpu", - "amd", - "nvidia" - }; - - uint32_t i = static_cast(type); - if(i >= countof(backendNames)) - i = 0; - - return backendNames[i]; - } + uint32_t i = static_cast(type); + if(i >= countof(backendNames)) + i = 0; + + return backendNames[i]; + } - std::atomic iHashCount; - std::atomic iTimestamp; - uint32_t iThreadNo; - BackendType backendType = UNKNOWN; + std::atomic iHashCount; + std::atomic iTimestamp; + uint32_t iThreadNo; + BackendType backendType = UNKNOWN; + uint64_t iLastStamp = get_timestamp_ms(); + double avgHashPerMsec = 0.0; - iBackend() : iHashCount(0), iTimestamp(0) + void updateStats(uint64_t numNewHashes, size_t poolId) + { + uint64_t iStamp = get_timestamp_ms(); + double timeDiff = static_cast(iStamp - iLastStamp); + iLastStamp = iStamp; + + if(poolId == 0) { + // if dev pool is active interpolate the number of shares (avoid hash rate drops) + numNewHashes = static_cast(avgHashPerMsec * timeDiff); } - }; + else + { + const double hashRatePerMs = static_cast(numNewHashes) / timeDiff; + constexpr double averagingBias = 0.1; + avgHashPerMsec = avgHashPerMsec * (1.0 - averagingBias) + hashRatePerMs * averagingBias; + } + iHashCount.fetch_add(numNewHashes, std::memory_order_relaxed); + iTimestamp.store(iStamp, std::memory_order_relaxed); + } + + iBackend() : + iHashCount(0), + iTimestamp(0) + { + } +}; } // namespace xmrstak diff --git a/xmrstak/backend/miner_work.hpp b/xmrstak/backend/miner_work.hpp index d0e5237f2..114f2db8e 100644 --- a/xmrstak/backend/miner_work.hpp +++ b/xmrstak/backend/miner_work.hpp @@ -2,95 +2,110 @@ #include "xmrstak/backend/pool_data.hpp" -#include #include -#include -#include -#include #include +#include #include +#include +#include +#include namespace xmrstak { - struct miner_work +struct miner_work +{ + char sJobID[64]; + uint8_t bWorkBlob[128]; + uint32_t iWorkSize; + uint64_t iTarget; + bool bNiceHash; + bool bStall; + size_t iPoolId; + uint64_t iBlockHeight; + uint8_t* ref_ptr; + + miner_work() : + iWorkSize(0), + bNiceHash(false), + bStall(true), + iPoolId(invalid_pool_id), + ref_ptr((uint8_t*)&iBlockHeight) {} + + miner_work(const char* sJobID, const uint8_t* bWork, uint32_t iWorkSize, + uint64_t iTarget, bool bNiceHash, size_t iPoolId, uint64_t iBlockHeiht) : + iWorkSize(iWorkSize), + iTarget(iTarget), + bNiceHash(bNiceHash), + bStall(false), + iPoolId(iPoolId), + iBlockHeight(iBlockHeiht), + ref_ptr((uint8_t*)&iBlockHeight) { - char sJobID[64]; - uint8_t bWorkBlob[128]; - uint32_t iWorkSize; - uint64_t iTarget; - bool bNiceHash; - bool bStall; - size_t iPoolId; - uint64_t iBlockHeight; - uint8_t* ref_ptr; - - miner_work() : iWorkSize(0), bNiceHash(false), bStall(true), iPoolId(invalid_pool_id), ref_ptr((uint8_t*)&iBlockHeight) { } - - miner_work(const char* sJobID, const uint8_t* bWork, uint32_t iWorkSize, - uint64_t iTarget, bool bNiceHash, size_t iPoolId, uint64_t iBlockHeiht) : iWorkSize(iWorkSize), - iTarget(iTarget), bNiceHash(bNiceHash), bStall(false), iPoolId(iPoolId), iBlockHeight(iBlockHeiht), ref_ptr((uint8_t*)&iBlockHeight) - { - assert(iWorkSize <= sizeof(bWorkBlob)); - memcpy(this->bWorkBlob, bWork, iWorkSize); - memcpy(this->sJobID, sJobID, sizeof(miner_work::sJobID)); - } - - miner_work(miner_work&& from) : iWorkSize(from.iWorkSize), iTarget(from.iTarget), - bStall(from.bStall), iPoolId(from.iPoolId), iBlockHeight(from.iBlockHeight), ref_ptr((uint8_t*)&iBlockHeight) - { - assert(iWorkSize <= sizeof(bWorkBlob)); - memcpy(bWorkBlob, from.bWorkBlob, iWorkSize); - memcpy(this->sJobID, sJobID, sizeof(miner_work::sJobID)); - } - - miner_work(miner_work const&) = delete; - - miner_work& operator=(miner_work&& from) - { - assert(this != &from); - - iBlockHeight = from.iBlockHeight; - iPoolId = from.iPoolId; - bStall = from.bStall; - iWorkSize = from.iWorkSize; - bNiceHash = from.bNiceHash; - iTarget = from.iTarget; - - assert(iWorkSize <= sizeof(bWorkBlob)); - memcpy(sJobID, from.sJobID, sizeof(sJobID)); - memcpy(bWorkBlob, from.bWorkBlob, iWorkSize); + assert(iWorkSize <= sizeof(bWorkBlob)); + memcpy(this->bWorkBlob, bWork, iWorkSize); + memcpy(this->sJobID, sJobID, sizeof(miner_work::sJobID)); + } + + miner_work(miner_work&& from) : + iWorkSize(from.iWorkSize), + iTarget(from.iTarget), + bStall(from.bStall), + iPoolId(from.iPoolId), + iBlockHeight(from.iBlockHeight), + ref_ptr((uint8_t*)&iBlockHeight) + { + assert(iWorkSize <= sizeof(bWorkBlob)); + memcpy(bWorkBlob, from.bWorkBlob, iWorkSize); + memcpy(this->sJobID, sJobID, sizeof(miner_work::sJobID)); + } - return *this; - } + miner_work(miner_work const&) = delete; + + miner_work& operator=(miner_work&& from) + { + assert(this != &from); - miner_work& operator=(miner_work const& from) - { - assert(this != &from); + iBlockHeight = from.iBlockHeight; + iPoolId = from.iPoolId; + bStall = from.bStall; + iWorkSize = from.iWorkSize; + bNiceHash = from.bNiceHash; + iTarget = from.iTarget; - iBlockHeight = from.iBlockHeight; - iPoolId = from.iPoolId; - bStall = from.bStall; - iWorkSize = from.iWorkSize; - bNiceHash = from.bNiceHash; - iTarget = from.iTarget; + assert(iWorkSize <= sizeof(bWorkBlob)); + memcpy(sJobID, from.sJobID, sizeof(sJobID)); + memcpy(bWorkBlob, from.bWorkBlob, iWorkSize); - if(!ref_ptr) - return *this; + return *this; + } - for(size_t i=0; i <= 7 && iPoolId; i++) - ref_ptr[i] = from.ref_ptr[7-i]; + miner_work& operator=(miner_work const& from) + { + assert(this != &from); - assert(iWorkSize <= sizeof(bWorkBlob)); - memcpy(sJobID, from.sJobID, sizeof(sJobID)); - memcpy(bWorkBlob, from.bWorkBlob, iWorkSize); + iBlockHeight = from.iBlockHeight; + iPoolId = from.iPoolId; + bStall = from.bStall; + iWorkSize = from.iWorkSize; + bNiceHash = from.bNiceHash; + iTarget = from.iTarget; + if(!ref_ptr) return *this; - } - uint8_t getVersion() const - { - return bWorkBlob[0]; - } + for(size_t i = 0; i <= 7 && iPoolId; i++) + ref_ptr[i] = from.ref_ptr[7 - i]; + + assert(iWorkSize <= sizeof(bWorkBlob)); + memcpy(sJobID, from.sJobID, sizeof(sJobID)); + memcpy(bWorkBlob, from.bWorkBlob, iWorkSize); - }; + return *this; + } + + uint8_t getVersion() const + { + return bWorkBlob[0]; + } +}; } // namespace xmrstak diff --git a/xmrstak/backend/nvidia/CudaCryptonightR_gen.cpp b/xmrstak/backend/nvidia/CudaCryptonightR_gen.cpp index f1bf75819..a7587cbe0 100644 --- a/xmrstak/backend/nvidia/CudaCryptonightR_gen.cpp +++ b/xmrstak/backend/nvidia/CudaCryptonightR_gen.cpp @@ -14,17 +14,17 @@ * */ -#include -#include -#include #include +#include #include +#include +#include #include -#include "xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp" #include "xmrstak/backend/cpu/crypto/variant4_random_math.h" -#include "xmrstak/misc/console.hpp" +#include "xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp" #include "xmrstak/cpputil/read_write_lock.h" +#include "xmrstak/misc/console.hpp" namespace xmrstak { @@ -33,80 +33,82 @@ namespace nvidia static std::string get_code(const V4_Instruction* code, int code_size) { - std::stringstream s; + std::stringstream s; - for (int i = 0; i < code_size; ++i) - { - const V4_Instruction inst = code[i]; + for(int i = 0; i < code_size; ++i) + { + const V4_Instruction inst = code[i]; - const uint32_t a = inst.dst_index; - const uint32_t b = inst.src_index; + const uint32_t a = inst.dst_index; + const uint32_t b = inst.src_index; - switch (inst.opcode) - { - case MUL: - s << 'r' << a << "*=r" << b << ';'; - break; + switch(inst.opcode) + { + case MUL: + s << 'r' << a << "*=r" << b << ';'; + break; - case ADD: - s << 'r' << a << "+=r" << b << '+' << inst.C << "U;"; - break; + case ADD: + s << 'r' << a << "+=r" << b << '+' << inst.C << "U;"; + break; - case SUB: - s << 'r' << a << "-=r" << b << ';'; - break; + case SUB: + s << 'r' << a << "-=r" << b << ';'; + break; - case ROR: - s << 'r' << a << "=rotate_right(r" << a << ",r" << b << ");"; - break; + case ROR: + s << 'r' << a << "=rotate_right(r" << a << ",r" << b << ");"; + break; - case ROL: - s << 'r' << a << "=rotate_left(r" << a << ",r" << b << ");"; - break; + case ROL: + s << 'r' << a << "=rotate_left(r" << a << ",r" << b << ");"; + break; - case XOR: - s << 'r' << a << "^=r" << b << ';'; - break; - } + case XOR: + s << 'r' << a << "^=r" << b << ';'; + break; + } - s << '\n'; - } + s << '\n'; + } - return s.str(); + return s.str(); } struct CacheEntry { - CacheEntry(xmrstak_algo algo, uint64_t height, int arch_major, int arch_minor, const std::vector& ptx, const std::string& lowered_name) : - algo(algo), - height(height), - arch_major(arch_major), - arch_minor(arch_minor), - ptx(ptx), - lowered_name(lowered_name) - {} - - xmrstak_algo algo; - uint64_t height; - int arch_major; - int arch_minor; - std::vector ptx; - std::string lowered_name; + CacheEntry(xmrstak_algo algo, uint64_t height, int arch_major, int arch_minor, const std::vector& ptx, const std::string& lowered_name) : + algo(algo), + height(height), + arch_major(arch_major), + arch_minor(arch_minor), + ptx(ptx), + lowered_name(lowered_name) + { + } + + xmrstak_algo algo; + uint64_t height; + int arch_major; + int arch_minor; + std::vector ptx; + std::string lowered_name; }; struct BackgroundTaskBase { - virtual ~BackgroundTaskBase() {} - virtual void exec() = 0; + virtual ~BackgroundTaskBase() {} + virtual void exec() = 0; }; -template +template struct BackgroundTask : public BackgroundTaskBase { - BackgroundTask(T&& func) : m_func(std::move(func)) {} - void exec() override { m_func(); } + BackgroundTask(T&& func) : + m_func(std::move(func)) {} + void exec() override { m_func(); } - T m_func; + T m_func; }; static ::cpputil::RWLock CryptonightR_cache_mutex; @@ -119,155 +121,165 @@ static std::thread* background_thread = nullptr; static void background_thread_proc() { - std::vector tasks; - for (;;) { - tasks.clear(); - { - std::lock_guard g(background_tasks_mutex); - background_tasks.swap(tasks); - } - - for (BackgroundTaskBase* task : tasks) { - task->exec(); - delete task; - } - - std::this_thread::sleep_for(std::chrono::milliseconds(500)); - } + std::vector tasks; + for(;;) + { + tasks.clear(); + { + std::lock_guard g(background_tasks_mutex); + background_tasks.swap(tasks); + } + + for(BackgroundTaskBase* task : tasks) + { + task->exec(); + delete task; + } + + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + } } -template +template static void background_exec(T&& func) { - BackgroundTaskBase* task = new BackgroundTask(std::move(func)); - - std::lock_guard g(background_tasks_mutex); - background_tasks.push_back(task); - if (!background_thread) { - background_thread = new std::thread(background_thread_proc); - } + BackgroundTaskBase* task = new BackgroundTask(std::move(func)); + + std::lock_guard g(background_tasks_mutex); + background_tasks.push_back(task); + if(!background_thread) + { + background_thread = new std::thread(background_thread_proc); + } } static void CryptonightR_build_program( - std::vector& ptx, - std::string& lowered_name, - const xmrstak_algo& algo, - uint64_t height, - uint32_t precompile_count, - int arch_major, - int arch_minor, - std::string source) + std::vector& ptx, + std::string& lowered_name, + const xmrstak_algo& algo, + uint64_t height, + uint32_t precompile_count, + int arch_major, + int arch_minor, + std::string source) { - { + { CryptonightR_cache_mutex.WriteLock(); - // Remove old programs from cache - for (size_t i = 0; i < CryptonightR_cache.size();) - { - const CacheEntry& entry = CryptonightR_cache[i]; - if ((entry.algo == algo) && (entry.height + 2 + precompile_count < height)) - { - printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu released (old program)", entry.height); - CryptonightR_cache[i] = std::move(CryptonightR_cache.back()); - CryptonightR_cache.pop_back(); - } - else - { - ++i; - } - } + // Remove old programs from cache + for(size_t i = 0; i < CryptonightR_cache.size();) + { + const CacheEntry& entry = CryptonightR_cache[i]; + if((entry.algo == algo) && (entry.height + 2 + precompile_count < height)) + { + printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu released (old program)", entry.height); + CryptonightR_cache[i] = std::move(CryptonightR_cache.back()); + CryptonightR_cache.pop_back(); + } + else + { + ++i; + } + } CryptonightR_cache_mutex.UnLock(); - } + } - ptx.clear(); - ptx.reserve(65536); + ptx.clear(); + ptx.reserve(65536); - std::lock_guard g1(CryptonightR_build_mutex); - { + std::lock_guard g1(CryptonightR_build_mutex); + { CryptonightR_cache_mutex.ReadLock(); - // Check if the cache already has this program (some other thread might have added it first) - for (const CacheEntry& entry : CryptonightR_cache) - { - if ((entry.algo == algo) && (entry.height == height) && (entry.arch_major == arch_major) && (entry.arch_minor == arch_minor)) - { - ptx = entry.ptx; - lowered_name = entry.lowered_name; + // Check if the cache already has this program (some other thread might have added it first) + for(const CacheEntry& entry : CryptonightR_cache) + { + if((entry.algo == algo) && (entry.height == height) && (entry.arch_major == arch_major) && (entry.arch_minor == arch_minor)) + { + ptx = entry.ptx; + lowered_name = entry.lowered_name; CryptonightR_cache_mutex.UnLock(); - return; - } - } + return; + } + } CryptonightR_cache_mutex.UnLock(); - } - - nvrtcProgram prog; - nvrtcResult result = nvrtcCreateProgram(&prog, source.c_str(), "CryptonightR.curt", 0, NULL, NULL); - if (result != NVRTC_SUCCESS) { - printer::inst()->print_msg(L0, "nvrtcCreateProgram failed: %s", nvrtcGetErrorString(result)); - return; - } - - result = nvrtcAddNameExpression(prog, "CryptonightR_phase2"); - if (result != NVRTC_SUCCESS) { - printer::inst()->print_msg(L0, "nvrtcAddNameExpression failed: %s", nvrtcGetErrorString(result)); - nvrtcDestroyProgram(&prog); - return; - } - - char opt0[64]; - sprintf(opt0, "--gpu-architecture=compute_%d%d", arch_major, arch_minor); - - char opt1[64]; - sprintf(opt1, "-DALGO=%d", static_cast(algo.Id())); - - const char* opts[2] = { opt0, opt1 }; - - result = nvrtcCompileProgram(prog, 2, opts); - if (result != NVRTC_SUCCESS) { - printer::inst()->print_msg(L0, "nvrtcCompileProgram failed: %s", nvrtcGetErrorString(result)); - - size_t logSize; - if (nvrtcGetProgramLogSize(prog, &logSize) == NVRTC_SUCCESS) { - char *log = new char[logSize]; - if (nvrtcGetProgramLog(prog, log) == NVRTC_SUCCESS) { - printer::inst()->print_msg(L0, "Program compile log: %s", log); - } - delete[]log; - } - nvrtcDestroyProgram(&prog); - return; - } - - - const char* name; - result = nvrtcGetLoweredName(prog, "CryptonightR_phase2", &name); - if (result != NVRTC_SUCCESS) { - printer::inst()->print_msg(L0, "nvrtcGetLoweredName failed: %s", nvrtcGetErrorString(result)); - nvrtcDestroyProgram(&prog); - return; - } - - size_t ptxSize; - result = nvrtcGetPTXSize(prog, &ptxSize); - if (result != NVRTC_SUCCESS) { - printer::inst()->print_msg(L0, "nvrtcGetPTXSize failed: %s", nvrtcGetErrorString(result)); - nvrtcDestroyProgram(&prog); - return; - } - - ptx.resize(ptxSize); - result = nvrtcGetPTX(prog, ptx.data()); - if (result != NVRTC_SUCCESS) { - printer::inst()->print_msg(L0, "nvrtcGetPTX failed: %s", nvrtcGetErrorString(result)); - nvrtcDestroyProgram(&prog); - return; - } - - lowered_name = name; - - nvrtcDestroyProgram(&prog); - - printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu compiled", height); + } + + nvrtcProgram prog; + nvrtcResult result = nvrtcCreateProgram(&prog, source.c_str(), "CryptonightR.curt", 0, NULL, NULL); + if(result != NVRTC_SUCCESS) + { + printer::inst()->print_msg(L0, "nvrtcCreateProgram failed: %s", nvrtcGetErrorString(result)); + return; + } + + result = nvrtcAddNameExpression(prog, "CryptonightR_phase2"); + if(result != NVRTC_SUCCESS) + { + printer::inst()->print_msg(L0, "nvrtcAddNameExpression failed: %s", nvrtcGetErrorString(result)); + nvrtcDestroyProgram(&prog); + return; + } + + char opt0[64]; + sprintf(opt0, "--gpu-architecture=compute_%d%d", arch_major, arch_minor); + + char opt1[64]; + sprintf(opt1, "-DALGO=%d", static_cast(algo.Id())); + + const char* opts[2] = {opt0, opt1}; + + result = nvrtcCompileProgram(prog, 2, opts); + if(result != NVRTC_SUCCESS) + { + printer::inst()->print_msg(L0, "nvrtcCompileProgram failed: %s", nvrtcGetErrorString(result)); + + size_t logSize; + if(nvrtcGetProgramLogSize(prog, &logSize) == NVRTC_SUCCESS) + { + char* log = new char[logSize]; + if(nvrtcGetProgramLog(prog, log) == NVRTC_SUCCESS) + { + printer::inst()->print_msg(L0, "Program compile log: %s", log); + } + delete[] log; + } + nvrtcDestroyProgram(&prog); + return; + } + + const char* name; + result = nvrtcGetLoweredName(prog, "CryptonightR_phase2", &name); + if(result != NVRTC_SUCCESS) + { + printer::inst()->print_msg(L0, "nvrtcGetLoweredName failed: %s", nvrtcGetErrorString(result)); + nvrtcDestroyProgram(&prog); + return; + } + + size_t ptxSize; + result = nvrtcGetPTXSize(prog, &ptxSize); + if(result != NVRTC_SUCCESS) + { + printer::inst()->print_msg(L0, "nvrtcGetPTXSize failed: %s", nvrtcGetErrorString(result)); + nvrtcDestroyProgram(&prog); + return; + } + + ptx.resize(ptxSize); + result = nvrtcGetPTX(prog, ptx.data()); + if(result != NVRTC_SUCCESS) + { + printer::inst()->print_msg(L0, "nvrtcGetPTX failed: %s", nvrtcGetErrorString(result)); + nvrtcDestroyProgram(&prog); + return; + } + + lowered_name = name; + + nvrtcDestroyProgram(&prog); + + printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu compiled", height); CryptonightR_cache_mutex.WriteLock(); CryptonightR_cache.emplace_back(algo, height, arch_major, arch_minor, ptx, lowered_name); @@ -276,62 +288,63 @@ static void CryptonightR_build_program( void CryptonightR_get_program(std::vector& ptx, std::string& lowered_name, const xmrstak_algo algo, uint64_t height, uint32_t precompile_count, int arch_major, int arch_minor, bool background) { - if (background) { - background_exec([=]() { std::vector tmp; std::string s; CryptonightR_get_program(tmp, s, algo, height, precompile_count, arch_major, arch_minor, false); }); - return; - } - - ptx.clear(); - - const char* source_code_template = - #include "nvcc_code/cuda_cryptonight_r.curt" - ; - const char include_name[] = "XMRSTAK_INCLUDE_RANDOM_MATH"; - const char* offset = strstr(source_code_template, include_name); - if (!offset) - { - printer::inst()->print_msg(L0, "CryptonightR_get_program: XMRSTAK_INCLUDE_RANDOM_MATH not found in cuda_cryptonight_r.curt"); - return; - } - - V4_Instruction code[256]; - int code_size; - switch (algo.Id()) - { - case cryptonight_r_wow: - code_size = v4_random_math_init(code, height); - break; - case cryptonight_r: - code_size = v4_random_math_init(code, height); - break; - printer::inst()->print_msg(LDEBUG, "CryptonightR_get_program: invalid algo %d", algo); - return; - } - - std::string source_code(source_code_template, offset); - source_code.append(get_code(code, code_size)); - source_code.append(offset + sizeof(include_name) - 1); - - { + if(background) + { + background_exec([=]() { std::vector tmp; std::string s; CryptonightR_get_program(tmp, s, algo, height, precompile_count, arch_major, arch_minor, false); }); + return; + } + + ptx.clear(); + + const char* source_code_template = +#include "nvcc_code/cuda_cryptonight_r.curt" + ; + const char include_name[] = "XMRSTAK_INCLUDE_RANDOM_MATH"; + const char* offset = strstr(source_code_template, include_name); + if(!offset) + { + printer::inst()->print_msg(L0, "CryptonightR_get_program: XMRSTAK_INCLUDE_RANDOM_MATH not found in cuda_cryptonight_r.curt"); + return; + } + + V4_Instruction code[256]; + int code_size; + switch(algo.Id()) + { + case cryptonight_r_wow: + code_size = v4_random_math_init(code, height); + break; + case cryptonight_r: + code_size = v4_random_math_init(code, height); + break; + printer::inst()->print_msg(LDEBUG, "CryptonightR_get_program: invalid algo %d", algo); + return; + } + + std::string source_code(source_code_template, offset); + source_code.append(get_code(code, code_size)); + source_code.append(offset + sizeof(include_name) - 1); + + { CryptonightR_cache_mutex.ReadLock(); - // Check if the cache has this program - for (const CacheEntry& entry : CryptonightR_cache) - { - if ((entry.algo == algo) && (entry.height == height) && (entry.arch_major == arch_major) && (entry.arch_minor == arch_minor)) - { - printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu found in cache", height); - ptx = entry.ptx; - lowered_name = entry.lowered_name; + // Check if the cache has this program + for(const CacheEntry& entry : CryptonightR_cache) + { + if((entry.algo == algo) && (entry.height == height) && (entry.arch_major == arch_major) && (entry.arch_minor == arch_minor)) + { + printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu found in cache", height); + ptx = entry.ptx; + lowered_name = entry.lowered_name; CryptonightR_cache_mutex.UnLock(); - return; - } - } + return; + } + } CryptonightR_cache_mutex.UnLock(); - } + } - CryptonightR_build_program(ptx, lowered_name, algo, height, precompile_count, arch_major, arch_minor, source_code); + CryptonightR_build_program(ptx, lowered_name, algo, height, precompile_count, arch_major, arch_minor, source_code); } +} // namespace nvidia } // namespace xmrstak -} //namespace nvidia diff --git a/xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp b/xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp index c3d8827b0..30abf2e59 100644 --- a/xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp +++ b/xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp @@ -19,9 +19,8 @@ #include "xmrstak/backend/cryptonight.hpp" #include -#include #include - +#include namespace xmrstak { @@ -29,9 +28,7 @@ namespace nvidia { void CryptonightR_get_program(std::vector& ptx, std::string& lowered_name, - const xmrstak_algo algo, uint64_t height, uint32_t precompile_count, int arch_major, int arch_minor, bool background = false); - + const xmrstak_algo algo, uint64_t height, uint32_t precompile_count, int arch_major, int arch_minor, bool background = false); +} // namespace nvidia } // namespace xmrstak -} //namespace nvidia - diff --git a/xmrstak/backend/nvidia/autoAdjust.hpp b/xmrstak/backend/nvidia/autoAdjust.hpp index 2755e03d2..a7f35b18b 100644 --- a/xmrstak/backend/nvidia/autoAdjust.hpp +++ b/xmrstak/backend/nvidia/autoAdjust.hpp @@ -3,17 +3,16 @@ #include "autoAdjust.hpp" -#include "nvcc_code/cryptonight.hpp" #include "jconf.hpp" -#include "xmrstak/misc/console.hpp" +#include "nvcc_code/cryptonight.hpp" #include "xmrstak/misc/configEditor.hpp" +#include "xmrstak/misc/console.hpp" #include "xmrstak/params.hpp" -#include #include #include #include - +#include namespace xmrstak { @@ -22,11 +21,9 @@ namespace nvidia class autoAdjust { -public: - + public: autoAdjust() { - } /** print the adjusted values if needed @@ -63,25 +60,22 @@ class autoAdjust nvidCtxVec.push_back(ctx); else printer::inst()->print_msg(L0, "WARNING: NVIDIA setup failed for GPU %d.\n", i); - } generateThreadConfig(); return true; - } -private: - + private: void generateThreadConfig() { // load the template of the backend config into a char variable - const char *tpl = - #include "./config.tpl" - ; + const char* tpl = +#include "./config.tpl" + ; configEditor configTpl{}; - configTpl.set( std::string(tpl) ); + configTpl.set(std::string(tpl)); constexpr size_t byte2mib = 1024u * 1024u; std::string conf; @@ -90,18 +84,18 @@ class autoAdjust if(ctx.device_threads * ctx.device_blocks > 0) { conf += std::string(" // gpu: ") + ctx.name + " architecture: " + std::to_string(ctx.device_arch[0] * 10 + ctx.device_arch[1]) + "\n"; - conf += std::string(" // memory: ") + std::to_string(ctx.free_device_memory / byte2mib) + "/" + std::to_string(ctx.total_device_memory / byte2mib) + " MiB\n"; + conf += std::string(" // memory: ") + std::to_string(ctx.free_device_memory / byte2mib) + "/" + std::to_string(ctx.total_device_memory / byte2mib) + " MiB\n"; conf += std::string(" // smx: ") + std::to_string(ctx.device_mpcount) + "\n"; conf += std::string(" { \"index\" : ") + std::to_string(ctx.device_id) + ",\n" + - " \"threads\" : " + std::to_string(ctx.device_threads) + ", \"blocks\" : " + std::to_string(ctx.device_blocks) + ",\n" + - " \"bfactor\" : " + std::to_string(ctx.device_bfactor) + ", \"bsleep\" : " + std::to_string(ctx.device_bsleep) + ",\n" + - " \"affine_to_cpu\" : false, \"sync_mode\" : 3,\n" + - " \"mem_mode\" : 1,\n" + - " },\n"; + " \"threads\" : " + std::to_string(ctx.device_threads) + ", \"blocks\" : " + std::to_string(ctx.device_blocks) + ",\n" + + " \"bfactor\" : " + std::to_string(ctx.device_bfactor) + ", \"bsleep\" : " + std::to_string(ctx.device_bsleep) + ",\n" + + " \"affine_to_cpu\" : false, \"sync_mode\" : 3,\n" + + " \"mem_mode\" : 1,\n" + + " },\n"; } } - configTpl.replace("GPUCONFIG",conf); + configTpl.replace("GPUCONFIG", conf); configTpl.write(params::inst().configFileNVIDIA); printer::inst()->print_msg(L0, "NVIDIA: GPU configuration stored in file '%s'", params::inst().configFileNVIDIA.c_str()); } diff --git a/xmrstak/backend/nvidia/jconf.cpp b/xmrstak/backend/nvidia/jconf.cpp index 6c443343b..1cd113c4d 100644 --- a/xmrstak/backend/nvidia/jconf.cpp +++ b/xmrstak/backend/nvidia/jconf.cpp @@ -22,8 +22,8 @@ */ #include "jconf.hpp" -#include "xmrstak/misc/jext.hpp" #include "xmrstak/misc/console.hpp" +#include "xmrstak/misc/jext.hpp" #include #include @@ -36,7 +36,6 @@ #include #endif - namespace xmrstak { namespace nvidia @@ -47,9 +46,13 @@ using namespace rapidjson; /* * This enum needs to match index in oConfigValues, otherwise we will get a runtime error */ -enum configEnum { aGpuThreadsConf }; +enum configEnum +{ + aGpuThreadsConf +}; -struct configVal { +struct configVal +{ configEnum iName; const char* sName; Type iType; @@ -58,8 +61,7 @@ struct configVal { // Same order as in configEnum, as per comment above // kNullType means any type configVal oConfigValues[] = { - { aGpuThreadsConf, "gpu_threads_conf", kNullType } -}; + {aGpuThreadsConf, "gpu_threads_conf", kNullType}}; inline bool checkType(Type have, Type want) { @@ -75,9 +77,7 @@ inline bool checkType(Type have, Type want) return false; } -constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0])); - - +constexpr size_t iConfigCnt = (sizeof(oConfigValues) / sizeof(oConfigValues[0])); struct jconf::opaque_private { @@ -89,7 +89,6 @@ struct jconf::opaque_private } }; - bool jconf::NeedsAutoconf() { return !prv->configValues[aGpuThreadsConf]->IsArray(); @@ -110,7 +109,7 @@ size_t jconf::GetGPUThreadCount() return 0; } -bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg) +bool jconf::GetGPUThreadConfig(size_t id, thd_cfg& cfg) { if(!prv->configValues[aGpuThreadsConf]->IsArray()) return false; @@ -170,7 +169,6 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg) return false; } - cfg.id = gid->GetInt(); cfg.blocks = blocks->GetInt(); cfg.threads = threads->GetInt(); @@ -178,7 +176,7 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg) cfg.bsleep = bsleep->GetInt(); cfg.syncMode = syncMode->GetInt(); cfg.memMode = memMode->GetInt(); - + if(aff->IsNumber()) cfg.cpu_aff = aff->GetInt(); else @@ -189,22 +187,22 @@ bool jconf::GetGPUThreadConfig(size_t id, thd_cfg &cfg) bool jconf::parse_config(const char* sFilename) { - FILE * pFile; - char * buffer; + FILE* pFile; + char* buffer; size_t flen; pFile = fopen(sFilename, "rb"); - if (pFile == NULL) + if(pFile == NULL) { printer::inst()->print_msg(L0, "Failed to open config file %s.", sFilename); return false; } - fseek(pFile,0,SEEK_END); + fseek(pFile, 0, SEEK_END); flen = ftell(pFile); rewind(pFile); - if(flen >= 64*1024) + if(flen >= 64 * 1024) { fclose(pFile); printer::inst()->print_msg(L0, "Oversized config file - %s.", sFilename); @@ -219,7 +217,7 @@ bool jconf::parse_config(const char* sFilename) } buffer = (char*)malloc(flen + 3); - if(fread(buffer+1, flen, 1, pFile) != 1) + if(fread(buffer + 1, flen, 1, pFile) != 1) { free(buffer); fclose(pFile); @@ -241,7 +239,7 @@ bool jconf::parse_config(const char* sFilename) buffer[flen] = '}'; buffer[flen + 1] = '\0'; - prv->jsonDoc.Parse(buffer, flen+2); + prv->jsonDoc.Parse(buffer, flen + 2); free(buffer); if(prv->jsonDoc.HasParseError()) @@ -251,7 +249,6 @@ bool jconf::parse_config(const char* sFilename) return false; } - if(!prv->jsonDoc.IsObject()) { //This should never happen as we created the root ourselves printer::inst()->print_msg(L0, "Invalid config file '%s'. No root?", sFilename); @@ -262,7 +259,7 @@ bool jconf::parse_config(const char* sFilename) { if(oConfigValues[i].iName != i) { - printer::inst()->print_msg(L0, "Code error. oConfigValues are not in order. %s",oConfigValues[i].sName); + printer::inst()->print_msg(L0, "Code error. oConfigValues are not in order. %s", oConfigValues[i].sName); return false; } diff --git a/xmrstak/backend/nvidia/jconf.hpp b/xmrstak/backend/nvidia/jconf.hpp index 40b72f880..e924c75a9 100644 --- a/xmrstak/backend/nvidia/jconf.hpp +++ b/xmrstak/backend/nvidia/jconf.hpp @@ -1,7 +1,7 @@ #pragma once +#include "xmrstak/params.hpp" #include #include -#include "xmrstak/params.hpp" namespace xmrstak { @@ -10,16 +10,18 @@ namespace nvidia class jconf { -public: + public: static jconf* inst() { - if (oInst == nullptr) oInst = new jconf; + if(oInst == nullptr) + oInst = new jconf; return oInst; }; bool parse_config(const char* sFilename = params::inst().configFileNVIDIA.c_str()); - struct thd_cfg { + struct thd_cfg + { uint32_t id; uint32_t blocks; uint32_t threads; @@ -36,17 +38,16 @@ class jconf size_t GetGPUThreadCount(); - bool GetGPUThreadConfig(size_t id, thd_cfg &cfg); + bool GetGPUThreadConfig(size_t id, thd_cfg& cfg); bool NeedsAutoconf(); -private: + private: jconf(); static jconf* oInst; struct opaque_private; opaque_private* prv; - }; } // namespace nvidia diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp index 80615d7a3..32b21dc71 100644 --- a/xmrstak/backend/nvidia/minethd.cpp +++ b/xmrstak/backend/nvidia/minethd.cpp @@ -23,23 +23,23 @@ #include "minethd.hpp" #include "autoAdjust.hpp" -#include "xmrstak/misc/console.hpp" -#include "xmrstak/backend/cpu/crypto/cryptonight_aesni.h" #include "xmrstak/backend/cpu/crypto/cryptonight.h" +#include "xmrstak/backend/cpu/crypto/cryptonight_aesni.h" +#include "xmrstak/backend/cpu/hwlocMemory.hpp" #include "xmrstak/backend/cpu/minethd.hpp" -#include "xmrstak/params.hpp" -#include "xmrstak/misc/executor.hpp" +#include "xmrstak/backend/cryptonight.hpp" #include "xmrstak/jconf.hpp" +#include "xmrstak/misc/console.hpp" #include "xmrstak/misc/environment.hpp" -#include "xmrstak/backend/cpu/hwlocMemory.hpp" -#include "xmrstak/backend/cryptonight.hpp" +#include "xmrstak/misc/executor.hpp" #include "xmrstak/misc/utility.hpp" +#include "xmrstak/params.hpp" #include -#include +#include #include +#include #include -#include #include #ifndef USE_PRECOMPILED_HEADERS @@ -47,8 +47,8 @@ #include #include #else -#include #include +#include #endif #include #endif @@ -59,9 +59,9 @@ namespace nvidia { #ifdef WIN32 - HINSTANCE lib_handle; +HINSTANCE lib_handle; #else - void *lib_handle; +void* lib_handle; #endif minethd::minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg) @@ -101,23 +101,21 @@ void minethd::start_mining() printer::inst()->print_msg(L1, "WARNING setting affinity failed."); } - bool minethd::self_test() { return true; } - extern "C" { #ifdef WIN32 -__declspec(dllexport) + __declspec(dllexport) #endif -std::vector* xmrstak_start_backend(uint32_t threadOffset, miner_work& pWork, environment& env) -{ - environment::inst(&env); - return nvidia::minethd::thread_starter(threadOffset, pWork); -} + std::vector* xmrstak_start_backend(uint32_t threadOffset, miner_work& pWork, environment& env) + { + environment::inst(&env); + return nvidia::minethd::thread_starter(threadOffset, pWork); + } } // extern "C" std::vector* minethd::thread_starter(uint32_t threadOffset, miner_work& pWork) @@ -141,12 +139,12 @@ std::vector* minethd::thread_starter(uint32_t threadOffset, miner_wor int deviceCount = 0; if(cuda_get_devicecount(&deviceCount) != 1) { - std::cout<<"WARNING: NVIDIA no device found"<GetGPUThreadCount(); @@ -155,7 +153,7 @@ std::vector* minethd::thread_starter(uint32_t threadOffset, miner_wor cuInit(0); jconf::thd_cfg cfg; - for (i = 0; i < n; i++) + for(i = 0; i < n; i++) { jconf::inst()->GetGPUThreadConfig(i, cfg); @@ -172,10 +170,9 @@ std::vector* minethd::thread_starter(uint32_t threadOffset, miner_wor minethd* thd = new minethd(pWork, i + threadOffset, cfg); pvThreads->push_back(thd); - } - for (i = 0; i < n; i++) + for(i = 0; i < n; i++) { static_cast((*pvThreads)[i])->start_mining(); } @@ -201,7 +198,6 @@ void minethd::work_main() // wait until all NVIDIA devices are initialized thread_work_guard.wait(); - uint64_t iCount = 0; cryptonight_ctx* cpu_ctx; cpu_ctx = cpu::minethd::minethd_alloc_ctx(); @@ -216,16 +212,16 @@ void minethd::work_main() uint8_t version = 0; size_t lastPoolId = 0; - while (bQuit == 0) + while(bQuit == 0) { - if (oWork.bStall) + if(oWork.bStall) { /* We are stalled here because the executor didn't find a job for us yet, * either because of network latency, or a socket problem. Since we are * raison d'etre of this software it us sensible to just wait until we have something */ - while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) + while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) std::this_thread::sleep_for(std::chrono::milliseconds(100)); globalStates::inst().consume_work(oWork, iJobNo); @@ -285,8 +281,8 @@ void minethd::work_main() for(size_t i = 0; i < foundCount; i++) { - uint8_t bWorkBlob[128]; - uint8_t bResult[32]; + uint8_t bWorkBlob[128]; + uint8_t bResult[32]; memcpy(bWorkBlob, oWork.bWorkBlob, oWork.iWorkSize); memset(bResult, 0, sizeof(job_result::bResult)); @@ -294,19 +290,14 @@ void minethd::work_main() *(uint32_t*)(bWorkBlob + 39) = foundNonce[i]; cpu_ctx->hash_fn(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx, miner_algo); - if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget) + if((*((uint64_t*)(bResult + 24))) < oWork.iTarget) executor::inst()->push_event(ex_event(job_result(oWork.sJobID, foundNonce[i], bResult, iThreadNo, miner_algo), oWork.iPoolId)); else executor::inst()->push_event(ex_event("NVIDIA Invalid Result", ctx.device_id, oWork.iPoolId)); } - iCount += h_per_round; iNonce += h_per_round; - - using namespace std::chrono; - uint64_t iStamp = get_timestamp_ms(); - iHashCount.store(iCount, std::memory_order_relaxed); - iTimestamp.store(iStamp, std::memory_order_relaxed); + updateStats(h_per_round, oWork.iPoolId); std::this_thread::yield(); } @@ -314,5 +305,5 @@ void minethd::work_main() } } +} // namespace nvidia } // namespace xmrstak -} //namespace nvidia diff --git a/xmrstak/backend/nvidia/minethd.hpp b/xmrstak/backend/nvidia/minethd.hpp index 3863c93e8..66c49bb1f 100644 --- a/xmrstak/backend/nvidia/minethd.hpp +++ b/xmrstak/backend/nvidia/minethd.hpp @@ -1,19 +1,18 @@ #pragma once -#include "xmrstak/jconf.hpp" #include "jconf.hpp" #include "nvcc_code/cryptonight.hpp" +#include "xmrstak/jconf.hpp" #include "xmrstak/backend/cpu/minethd.hpp" #include "xmrstak/backend/iBackend.hpp" #include "xmrstak/misc/environment.hpp" +#include +#include #include #include -#include #include -#include - namespace xmrstak { @@ -22,12 +21,11 @@ namespace nvidia class minethd : public iBackend { -public: - + public: static std::vector* thread_starter(uint32_t threadOffset, miner_work& pWork); static bool self_test(); -private: + private: typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**, const xmrstak_algo&); minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg); diff --git a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp index 906701893..78abd7a3d 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp @@ -3,14 +3,15 @@ #include #include -#include "xmrstak/jconf.hpp" #include "xmrstak/backend/cryptonight.hpp" +#include "xmrstak/jconf.hpp" #include -typedef struct { +typedef struct +{ int device_id; - const char *device_name; + const char* device_name; int device_arch[2]; int device_mpcount; int device_blocks; @@ -20,18 +21,18 @@ typedef struct { int syncMode; bool memMode; - uint32_t *d_input; + uint32_t* d_input; uint32_t inputlen; - uint32_t *d_result_count; - uint32_t *d_result_nonce; - uint32_t *d_long_state; - uint32_t *d_ctx_state; - uint32_t *d_ctx_state2; - uint32_t *d_ctx_a; - uint32_t *d_ctx_b; - uint32_t *d_ctx_key1; - uint32_t *d_ctx_key2; - uint32_t *d_ctx_text; + uint32_t* d_result_count; + uint32_t* d_result_nonce; + uint32_t* d_long_state; + uint32_t* d_ctx_state; + uint32_t* d_ctx_state2; + uint32_t* d_ctx_a; + uint32_t* d_ctx_b; + uint32_t* d_ctx_key1; + uint32_t* d_ctx_key2; + uint32_t* d_ctx_text; std::string name; size_t free_device_memory; size_t total_device_memory; @@ -43,19 +44,20 @@ typedef struct { xmrstak_algo cached_algo = {xmrstak_algo_id::invalid_algo}; } nvid_ctx; -extern "C" { +extern "C" +{ -/** get device count + /** get device count * * @param deviceCount[out] cuda device count * @return error code: 0 == error is occurred, 1 == no error */ -int cuda_get_devicecount( int* deviceCount); -int cuda_get_deviceinfo(nvid_ctx *ctx); -int cryptonight_extra_cpu_init(nvid_ctx *ctx); -void cryptonight_extra_cpu_set_data( nvid_ctx* ctx, const void *data, uint32_t len); -void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce, const xmrstak_algo& miner_algo); -void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce, const xmrstak_algo& miner_algo); + int cuda_get_devicecount(int* deviceCount); + int cuda_get_deviceinfo(nvid_ctx* ctx); + int cryptonight_extra_cpu_init(nvid_ctx* ctx); + void cryptonight_extra_cpu_set_data(nvid_ctx* ctx, const void* data, uint32_t len); + void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce, const xmrstak_algo& miner_algo); + void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t* resnonce, const xmrstak_algo& miner_algo); } void cryptonight_core_cpu_hash(nvid_ctx* ctx, const xmrstak_algo& miner_algo, uint32_t startNonce, uint64_t chain_height); diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp index 199025635..d33e76715 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp @@ -3,287 +3,285 @@ #include -#define N_COLS 4 -#define WPOLY 0x011b +#define N_COLS 4 +#define WPOLY 0x011b static __constant__ uint32_t d_t_fn[1024] = -{ - 0xa56363c6U, 0x847c7cf8U, 0x997777eeU, 0x8d7b7bf6U, - 0x0df2f2ffU, 0xbd6b6bd6U, 0xb16f6fdeU, 0x54c5c591U, - 0x50303060U, 0x03010102U, 0xa96767ceU, 0x7d2b2b56U, - 0x19fefee7U, 0x62d7d7b5U, 0xe6abab4dU, 0x9a7676ecU, - 0x45caca8fU, 0x9d82821fU, 0x40c9c989U, 0x877d7dfaU, - 0x15fafaefU, 0xeb5959b2U, 0xc947478eU, 0x0bf0f0fbU, - 0xecadad41U, 0x67d4d4b3U, 0xfda2a25fU, 0xeaafaf45U, - 0xbf9c9c23U, 0xf7a4a453U, 0x967272e4U, 0x5bc0c09bU, - 0xc2b7b775U, 0x1cfdfde1U, 0xae93933dU, 0x6a26264cU, - 0x5a36366cU, 0x413f3f7eU, 0x02f7f7f5U, 0x4fcccc83U, - 0x5c343468U, 0xf4a5a551U, 0x34e5e5d1U, 0x08f1f1f9U, - 0x937171e2U, 0x73d8d8abU, 0x53313162U, 0x3f15152aU, - 0x0c040408U, 0x52c7c795U, 0x65232346U, 0x5ec3c39dU, - 0x28181830U, 0xa1969637U, 0x0f05050aU, 0xb59a9a2fU, - 0x0907070eU, 0x36121224U, 0x9b80801bU, 0x3de2e2dfU, - 0x26ebebcdU, 0x6927274eU, 0xcdb2b27fU, 0x9f7575eaU, - 0x1b090912U, 0x9e83831dU, 0x742c2c58U, 0x2e1a1a34U, - 0x2d1b1b36U, 0xb26e6edcU, 0xee5a5ab4U, 0xfba0a05bU, - 0xf65252a4U, 0x4d3b3b76U, 0x61d6d6b7U, 0xceb3b37dU, - 0x7b292952U, 0x3ee3e3ddU, 0x712f2f5eU, 0x97848413U, - 0xf55353a6U, 0x68d1d1b9U, 0x00000000U, 0x2cededc1U, - 0x60202040U, 0x1ffcfce3U, 0xc8b1b179U, 0xed5b5bb6U, - 0xbe6a6ad4U, 0x46cbcb8dU, 0xd9bebe67U, 0x4b393972U, - 0xde4a4a94U, 0xd44c4c98U, 0xe85858b0U, 0x4acfcf85U, - 0x6bd0d0bbU, 0x2aefefc5U, 0xe5aaaa4fU, 0x16fbfbedU, - 0xc5434386U, 0xd74d4d9aU, 0x55333366U, 0x94858511U, - 0xcf45458aU, 0x10f9f9e9U, 0x06020204U, 0x817f7ffeU, - 0xf05050a0U, 0x443c3c78U, 0xba9f9f25U, 0xe3a8a84bU, - 0xf35151a2U, 0xfea3a35dU, 0xc0404080U, 0x8a8f8f05U, - 0xad92923fU, 0xbc9d9d21U, 0x48383870U, 0x04f5f5f1U, - 0xdfbcbc63U, 0xc1b6b677U, 0x75dadaafU, 0x63212142U, - 0x30101020U, 0x1affffe5U, 0x0ef3f3fdU, 0x6dd2d2bfU, - 0x4ccdcd81U, 0x140c0c18U, 0x35131326U, 0x2fececc3U, - 0xe15f5fbeU, 0xa2979735U, 0xcc444488U, 0x3917172eU, - 0x57c4c493U, 0xf2a7a755U, 0x827e7efcU, 0x473d3d7aU, - 0xac6464c8U, 0xe75d5dbaU, 0x2b191932U, 0x957373e6U, - 0xa06060c0U, 0x98818119U, 0xd14f4f9eU, 0x7fdcdca3U, - 0x66222244U, 0x7e2a2a54U, 0xab90903bU, 0x8388880bU, - 0xca46468cU, 0x29eeeec7U, 0xd3b8b86bU, 0x3c141428U, - 0x79dedea7U, 0xe25e5ebcU, 0x1d0b0b16U, 0x76dbdbadU, - 0x3be0e0dbU, 0x56323264U, 0x4e3a3a74U, 0x1e0a0a14U, - 0xdb494992U, 0x0a06060cU, 0x6c242448U, 0xe45c5cb8U, - 0x5dc2c29fU, 0x6ed3d3bdU, 0xefacac43U, 0xa66262c4U, - 0xa8919139U, 0xa4959531U, 0x37e4e4d3U, 0x8b7979f2U, - 0x32e7e7d5U, 0x43c8c88bU, 0x5937376eU, 0xb76d6ddaU, - 0x8c8d8d01U, 0x64d5d5b1U, 0xd24e4e9cU, 0xe0a9a949U, - 0xb46c6cd8U, 0xfa5656acU, 0x07f4f4f3U, 0x25eaeacfU, - 0xaf6565caU, 0x8e7a7af4U, 0xe9aeae47U, 0x18080810U, - 0xd5baba6fU, 0x887878f0U, 0x6f25254aU, 0x722e2e5cU, - 0x241c1c38U, 0xf1a6a657U, 0xc7b4b473U, 0x51c6c697U, - 0x23e8e8cbU, 0x7cdddda1U, 0x9c7474e8U, 0x211f1f3eU, - 0xdd4b4b96U, 0xdcbdbd61U, 0x868b8b0dU, 0x858a8a0fU, - 0x907070e0U, 0x423e3e7cU, 0xc4b5b571U, 0xaa6666ccU, - 0xd8484890U, 0x05030306U, 0x01f6f6f7U, 0x120e0e1cU, - 0xa36161c2U, 0x5f35356aU, 0xf95757aeU, 0xd0b9b969U, - 0x91868617U, 0x58c1c199U, 0x271d1d3aU, 0xb99e9e27U, - 0x38e1e1d9U, 0x13f8f8ebU, 0xb398982bU, 0x33111122U, - 0xbb6969d2U, 0x70d9d9a9U, 0x898e8e07U, 0xa7949433U, - 0xb69b9b2dU, 0x221e1e3cU, 0x92878715U, 0x20e9e9c9U, - 0x49cece87U, 0xff5555aaU, 0x78282850U, 0x7adfdfa5U, - 0x8f8c8c03U, 0xf8a1a159U, 0x80898909U, 0x170d0d1aU, - 0xdabfbf65U, 0x31e6e6d7U, 0xc6424284U, 0xb86868d0U, - 0xc3414182U, 0xb0999929U, 0x772d2d5aU, 0x110f0f1eU, - 0xcbb0b07bU, 0xfc5454a8U, 0xd6bbbb6dU, 0x3a16162cU, - 0x6363c6a5U, 0x7c7cf884U, 0x7777ee99U, 0x7b7bf68dU, - 0xf2f2ff0dU, 0x6b6bd6bdU, 0x6f6fdeb1U, 0xc5c59154U, - 0x30306050U, 0x01010203U, 0x6767cea9U, 0x2b2b567dU, - 0xfefee719U, 0xd7d7b562U, 0xabab4de6U, 0x7676ec9aU, - 0xcaca8f45U, 0x82821f9dU, 0xc9c98940U, 0x7d7dfa87U, - 0xfafaef15U, 0x5959b2ebU, 0x47478ec9U, 0xf0f0fb0bU, - 0xadad41ecU, 0xd4d4b367U, 0xa2a25ffdU, 0xafaf45eaU, - 0x9c9c23bfU, 0xa4a453f7U, 0x7272e496U, 0xc0c09b5bU, - 0xb7b775c2U, 0xfdfde11cU, 0x93933daeU, 0x26264c6aU, - 0x36366c5aU, 0x3f3f7e41U, 0xf7f7f502U, 0xcccc834fU, - 0x3434685cU, 0xa5a551f4U, 0xe5e5d134U, 0xf1f1f908U, - 0x7171e293U, 0xd8d8ab73U, 0x31316253U, 0x15152a3fU, - 0x0404080cU, 0xc7c79552U, 0x23234665U, 0xc3c39d5eU, - 0x18183028U, 0x969637a1U, 0x05050a0fU, 0x9a9a2fb5U, - 0x07070e09U, 0x12122436U, 0x80801b9bU, 0xe2e2df3dU, - 0xebebcd26U, 0x27274e69U, 0xb2b27fcdU, 0x7575ea9fU, - 0x0909121bU, 0x83831d9eU, 0x2c2c5874U, 0x1a1a342eU, - 0x1b1b362dU, 0x6e6edcb2U, 0x5a5ab4eeU, 0xa0a05bfbU, - 0x5252a4f6U, 0x3b3b764dU, 0xd6d6b761U, 0xb3b37dceU, - 0x2929527bU, 0xe3e3dd3eU, 0x2f2f5e71U, 0x84841397U, - 0x5353a6f5U, 0xd1d1b968U, 0x00000000U, 0xededc12cU, - 0x20204060U, 0xfcfce31fU, 0xb1b179c8U, 0x5b5bb6edU, - 0x6a6ad4beU, 0xcbcb8d46U, 0xbebe67d9U, 0x3939724bU, - 0x4a4a94deU, 0x4c4c98d4U, 0x5858b0e8U, 0xcfcf854aU, - 0xd0d0bb6bU, 0xefefc52aU, 0xaaaa4fe5U, 0xfbfbed16U, - 0x434386c5U, 0x4d4d9ad7U, 0x33336655U, 0x85851194U, - 0x45458acfU, 0xf9f9e910U, 0x02020406U, 0x7f7ffe81U, - 0x5050a0f0U, 0x3c3c7844U, 0x9f9f25baU, 0xa8a84be3U, - 0x5151a2f3U, 0xa3a35dfeU, 0x404080c0U, 0x8f8f058aU, - 0x92923fadU, 0x9d9d21bcU, 0x38387048U, 0xf5f5f104U, - 0xbcbc63dfU, 0xb6b677c1U, 0xdadaaf75U, 0x21214263U, - 0x10102030U, 0xffffe51aU, 0xf3f3fd0eU, 0xd2d2bf6dU, - 0xcdcd814cU, 0x0c0c1814U, 0x13132635U, 0xececc32fU, - 0x5f5fbee1U, 0x979735a2U, 0x444488ccU, 0x17172e39U, - 0xc4c49357U, 0xa7a755f2U, 0x7e7efc82U, 0x3d3d7a47U, - 0x6464c8acU, 0x5d5dbae7U, 0x1919322bU, 0x7373e695U, - 0x6060c0a0U, 0x81811998U, 0x4f4f9ed1U, 0xdcdca37fU, - 0x22224466U, 0x2a2a547eU, 0x90903babU, 0x88880b83U, - 0x46468ccaU, 0xeeeec729U, 0xb8b86bd3U, 0x1414283cU, - 0xdedea779U, 0x5e5ebce2U, 0x0b0b161dU, 0xdbdbad76U, - 0xe0e0db3bU, 0x32326456U, 0x3a3a744eU, 0x0a0a141eU, - 0x494992dbU, 0x06060c0aU, 0x2424486cU, 0x5c5cb8e4U, - 0xc2c29f5dU, 0xd3d3bd6eU, 0xacac43efU, 0x6262c4a6U, - 0x919139a8U, 0x959531a4U, 0xe4e4d337U, 0x7979f28bU, - 0xe7e7d532U, 0xc8c88b43U, 0x37376e59U, 0x6d6ddab7U, - 0x8d8d018cU, 0xd5d5b164U, 0x4e4e9cd2U, 0xa9a949e0U, - 0x6c6cd8b4U, 0x5656acfaU, 0xf4f4f307U, 0xeaeacf25U, - 0x6565caafU, 0x7a7af48eU, 0xaeae47e9U, 0x08081018U, - 0xbaba6fd5U, 0x7878f088U, 0x25254a6fU, 0x2e2e5c72U, - 0x1c1c3824U, 0xa6a657f1U, 0xb4b473c7U, 0xc6c69751U, - 0xe8e8cb23U, 0xdddda17cU, 0x7474e89cU, 0x1f1f3e21U, - 0x4b4b96ddU, 0xbdbd61dcU, 0x8b8b0d86U, 0x8a8a0f85U, - 0x7070e090U, 0x3e3e7c42U, 0xb5b571c4U, 0x6666ccaaU, - 0x484890d8U, 0x03030605U, 0xf6f6f701U, 0x0e0e1c12U, - 0x6161c2a3U, 0x35356a5fU, 0x5757aef9U, 0xb9b969d0U, - 0x86861791U, 0xc1c19958U, 0x1d1d3a27U, 0x9e9e27b9U, - 0xe1e1d938U, 0xf8f8eb13U, 0x98982bb3U, 0x11112233U, - 0x6969d2bbU, 0xd9d9a970U, 0x8e8e0789U, 0x949433a7U, - 0x9b9b2db6U, 0x1e1e3c22U, 0x87871592U, 0xe9e9c920U, - 0xcece8749U, 0x5555aaffU, 0x28285078U, 0xdfdfa57aU, - 0x8c8c038fU, 0xa1a159f8U, 0x89890980U, 0x0d0d1a17U, - 0xbfbf65daU, 0xe6e6d731U, 0x424284c6U, 0x6868d0b8U, - 0x414182c3U, 0x999929b0U, 0x2d2d5a77U, 0x0f0f1e11U, - 0xb0b07bcbU, 0x5454a8fcU, 0xbbbb6dd6U, 0x16162c3aU, - 0x63c6a563U, 0x7cf8847cU, 0x77ee9977U, 0x7bf68d7bU, - 0xf2ff0df2U, 0x6bd6bd6bU, 0x6fdeb16fU, 0xc59154c5U, - 0x30605030U, 0x01020301U, 0x67cea967U, 0x2b567d2bU, - 0xfee719feU, 0xd7b562d7U, 0xab4de6abU, 0x76ec9a76U, - 0xca8f45caU, 0x821f9d82U, 0xc98940c9U, 0x7dfa877dU, - 0xfaef15faU, 0x59b2eb59U, 0x478ec947U, 0xf0fb0bf0U, - 0xad41ecadU, 0xd4b367d4U, 0xa25ffda2U, 0xaf45eaafU, - 0x9c23bf9cU, 0xa453f7a4U, 0x72e49672U, 0xc09b5bc0U, - 0xb775c2b7U, 0xfde11cfdU, 0x933dae93U, 0x264c6a26U, - 0x366c5a36U, 0x3f7e413fU, 0xf7f502f7U, 0xcc834fccU, - 0x34685c34U, 0xa551f4a5U, 0xe5d134e5U, 0xf1f908f1U, - 0x71e29371U, 0xd8ab73d8U, 0x31625331U, 0x152a3f15U, - 0x04080c04U, 0xc79552c7U, 0x23466523U, 0xc39d5ec3U, - 0x18302818U, 0x9637a196U, 0x050a0f05U, 0x9a2fb59aU, - 0x070e0907U, 0x12243612U, 0x801b9b80U, 0xe2df3de2U, - 0xebcd26ebU, 0x274e6927U, 0xb27fcdb2U, 0x75ea9f75U, - 0x09121b09U, 0x831d9e83U, 0x2c58742cU, 0x1a342e1aU, - 0x1b362d1bU, 0x6edcb26eU, 0x5ab4ee5aU, 0xa05bfba0U, - 0x52a4f652U, 0x3b764d3bU, 0xd6b761d6U, 0xb37dceb3U, - 0x29527b29U, 0xe3dd3ee3U, 0x2f5e712fU, 0x84139784U, - 0x53a6f553U, 0xd1b968d1U, 0x00000000U, 0xedc12cedU, - 0x20406020U, 0xfce31ffcU, 0xb179c8b1U, 0x5bb6ed5bU, - 0x6ad4be6aU, 0xcb8d46cbU, 0xbe67d9beU, 0x39724b39U, - 0x4a94de4aU, 0x4c98d44cU, 0x58b0e858U, 0xcf854acfU, - 0xd0bb6bd0U, 0xefc52aefU, 0xaa4fe5aaU, 0xfbed16fbU, - 0x4386c543U, 0x4d9ad74dU, 0x33665533U, 0x85119485U, - 0x458acf45U, 0xf9e910f9U, 0x02040602U, 0x7ffe817fU, - 0x50a0f050U, 0x3c78443cU, 0x9f25ba9fU, 0xa84be3a8U, - 0x51a2f351U, 0xa35dfea3U, 0x4080c040U, 0x8f058a8fU, - 0x923fad92U, 0x9d21bc9dU, 0x38704838U, 0xf5f104f5U, - 0xbc63dfbcU, 0xb677c1b6U, 0xdaaf75daU, 0x21426321U, - 0x10203010U, 0xffe51affU, 0xf3fd0ef3U, 0xd2bf6dd2U, - 0xcd814ccdU, 0x0c18140cU, 0x13263513U, 0xecc32fecU, - 0x5fbee15fU, 0x9735a297U, 0x4488cc44U, 0x172e3917U, - 0xc49357c4U, 0xa755f2a7U, 0x7efc827eU, 0x3d7a473dU, - 0x64c8ac64U, 0x5dbae75dU, 0x19322b19U, 0x73e69573U, - 0x60c0a060U, 0x81199881U, 0x4f9ed14fU, 0xdca37fdcU, - 0x22446622U, 0x2a547e2aU, 0x903bab90U, 0x880b8388U, - 0x468cca46U, 0xeec729eeU, 0xb86bd3b8U, 0x14283c14U, - 0xdea779deU, 0x5ebce25eU, 0x0b161d0bU, 0xdbad76dbU, - 0xe0db3be0U, 0x32645632U, 0x3a744e3aU, 0x0a141e0aU, - 0x4992db49U, 0x060c0a06U, 0x24486c24U, 0x5cb8e45cU, - 0xc29f5dc2U, 0xd3bd6ed3U, 0xac43efacU, 0x62c4a662U, - 0x9139a891U, 0x9531a495U, 0xe4d337e4U, 0x79f28b79U, - 0xe7d532e7U, 0xc88b43c8U, 0x376e5937U, 0x6ddab76dU, - 0x8d018c8dU, 0xd5b164d5U, 0x4e9cd24eU, 0xa949e0a9U, - 0x6cd8b46cU, 0x56acfa56U, 0xf4f307f4U, 0xeacf25eaU, - 0x65caaf65U, 0x7af48e7aU, 0xae47e9aeU, 0x08101808U, - 0xba6fd5baU, 0x78f08878U, 0x254a6f25U, 0x2e5c722eU, - 0x1c38241cU, 0xa657f1a6U, 0xb473c7b4U, 0xc69751c6U, - 0xe8cb23e8U, 0xdda17cddU, 0x74e89c74U, 0x1f3e211fU, - 0x4b96dd4bU, 0xbd61dcbdU, 0x8b0d868bU, 0x8a0f858aU, - 0x70e09070U, 0x3e7c423eU, 0xb571c4b5U, 0x66ccaa66U, - 0x4890d848U, 0x03060503U, 0xf6f701f6U, 0x0e1c120eU, - 0x61c2a361U, 0x356a5f35U, 0x57aef957U, 0xb969d0b9U, - 0x86179186U, 0xc19958c1U, 0x1d3a271dU, 0x9e27b99eU, - 0xe1d938e1U, 0xf8eb13f8U, 0x982bb398U, 0x11223311U, - 0x69d2bb69U, 0xd9a970d9U, 0x8e07898eU, 0x9433a794U, - 0x9b2db69bU, 0x1e3c221eU, 0x87159287U, 0xe9c920e9U, - 0xce8749ceU, 0x55aaff55U, 0x28507828U, 0xdfa57adfU, - 0x8c038f8cU, 0xa159f8a1U, 0x89098089U, 0x0d1a170dU, - 0xbf65dabfU, 0xe6d731e6U, 0x4284c642U, 0x68d0b868U, - 0x4182c341U, 0x9929b099U, 0x2d5a772dU, 0x0f1e110fU, - 0xb07bcbb0U, 0x54a8fc54U, 0xbb6dd6bbU, 0x162c3a16U, - 0xc6a56363U, 0xf8847c7cU, 0xee997777U, 0xf68d7b7bU, - 0xff0df2f2U, 0xd6bd6b6bU, 0xdeb16f6fU, 0x9154c5c5U, - 0x60503030U, 0x02030101U, 0xcea96767U, 0x567d2b2bU, - 0xe719fefeU, 0xb562d7d7U, 0x4de6ababU, 0xec9a7676U, - 0x8f45cacaU, 0x1f9d8282U, 0x8940c9c9U, 0xfa877d7dU, - 0xef15fafaU, 0xb2eb5959U, 0x8ec94747U, 0xfb0bf0f0U, - 0x41ecadadU, 0xb367d4d4U, 0x5ffda2a2U, 0x45eaafafU, - 0x23bf9c9cU, 0x53f7a4a4U, 0xe4967272U, 0x9b5bc0c0U, - 0x75c2b7b7U, 0xe11cfdfdU, 0x3dae9393U, 0x4c6a2626U, - 0x6c5a3636U, 0x7e413f3fU, 0xf502f7f7U, 0x834fccccU, - 0x685c3434U, 0x51f4a5a5U, 0xd134e5e5U, 0xf908f1f1U, - 0xe2937171U, 0xab73d8d8U, 0x62533131U, 0x2a3f1515U, - 0x080c0404U, 0x9552c7c7U, 0x46652323U, 0x9d5ec3c3U, - 0x30281818U, 0x37a19696U, 0x0a0f0505U, 0x2fb59a9aU, - 0x0e090707U, 0x24361212U, 0x1b9b8080U, 0xdf3de2e2U, - 0xcd26ebebU, 0x4e692727U, 0x7fcdb2b2U, 0xea9f7575U, - 0x121b0909U, 0x1d9e8383U, 0x58742c2cU, 0x342e1a1aU, - 0x362d1b1bU, 0xdcb26e6eU, 0xb4ee5a5aU, 0x5bfba0a0U, - 0xa4f65252U, 0x764d3b3bU, 0xb761d6d6U, 0x7dceb3b3U, - 0x527b2929U, 0xdd3ee3e3U, 0x5e712f2fU, 0x13978484U, - 0xa6f55353U, 0xb968d1d1U, 0x00000000U, 0xc12cededU, - 0x40602020U, 0xe31ffcfcU, 0x79c8b1b1U, 0xb6ed5b5bU, - 0xd4be6a6aU, 0x8d46cbcbU, 0x67d9bebeU, 0x724b3939U, - 0x94de4a4aU, 0x98d44c4cU, 0xb0e85858U, 0x854acfcfU, - 0xbb6bd0d0U, 0xc52aefefU, 0x4fe5aaaaU, 0xed16fbfbU, - 0x86c54343U, 0x9ad74d4dU, 0x66553333U, 0x11948585U, - 0x8acf4545U, 0xe910f9f9U, 0x04060202U, 0xfe817f7fU, - 0xa0f05050U, 0x78443c3cU, 0x25ba9f9fU, 0x4be3a8a8U, - 0xa2f35151U, 0x5dfea3a3U, 0x80c04040U, 0x058a8f8fU, - 0x3fad9292U, 0x21bc9d9dU, 0x70483838U, 0xf104f5f5U, - 0x63dfbcbcU, 0x77c1b6b6U, 0xaf75dadaU, 0x42632121U, - 0x20301010U, 0xe51affffU, 0xfd0ef3f3U, 0xbf6dd2d2U, - 0x814ccdcdU, 0x18140c0cU, 0x26351313U, 0xc32fececU, - 0xbee15f5fU, 0x35a29797U, 0x88cc4444U, 0x2e391717U, - 0x9357c4c4U, 0x55f2a7a7U, 0xfc827e7eU, 0x7a473d3dU, - 0xc8ac6464U, 0xbae75d5dU, 0x322b1919U, 0xe6957373U, - 0xc0a06060U, 0x19988181U, 0x9ed14f4fU, 0xa37fdcdcU, - 0x44662222U, 0x547e2a2aU, 0x3bab9090U, 0x0b838888U, - 0x8cca4646U, 0xc729eeeeU, 0x6bd3b8b8U, 0x283c1414U, - 0xa779dedeU, 0xbce25e5eU, 0x161d0b0bU, 0xad76dbdbU, - 0xdb3be0e0U, 0x64563232U, 0x744e3a3aU, 0x141e0a0aU, - 0x92db4949U, 0x0c0a0606U, 0x486c2424U, 0xb8e45c5cU, - 0x9f5dc2c2U, 0xbd6ed3d3U, 0x43efacacU, 0xc4a66262U, - 0x39a89191U, 0x31a49595U, 0xd337e4e4U, 0xf28b7979U, - 0xd532e7e7U, 0x8b43c8c8U, 0x6e593737U, 0xdab76d6dU, - 0x018c8d8dU, 0xb164d5d5U, 0x9cd24e4eU, 0x49e0a9a9U, - 0xd8b46c6cU, 0xacfa5656U, 0xf307f4f4U, 0xcf25eaeaU, - 0xcaaf6565U, 0xf48e7a7aU, 0x47e9aeaeU, 0x10180808U, - 0x6fd5babaU, 0xf0887878U, 0x4a6f2525U, 0x5c722e2eU, - 0x38241c1cU, 0x57f1a6a6U, 0x73c7b4b4U, 0x9751c6c6U, - 0xcb23e8e8U, 0xa17cddddU, 0xe89c7474U, 0x3e211f1fU, - 0x96dd4b4bU, 0x61dcbdbdU, 0x0d868b8bU, 0x0f858a8aU, - 0xe0907070U, 0x7c423e3eU, 0x71c4b5b5U, 0xccaa6666U, - 0x90d84848U, 0x06050303U, 0xf701f6f6U, 0x1c120e0eU, - 0xc2a36161U, 0x6a5f3535U, 0xaef95757U, 0x69d0b9b9U, - 0x17918686U, 0x9958c1c1U, 0x3a271d1dU, 0x27b99e9eU, - 0xd938e1e1U, 0xeb13f8f8U, 0x2bb39898U, 0x22331111U, - 0xd2bb6969U, 0xa970d9d9U, 0x07898e8eU, 0x33a79494U, - 0x2db69b9bU, 0x3c221e1eU, 0x15928787U, 0xc920e9e9U, - 0x8749ceceU, 0xaaff5555U, 0x50782828U, 0xa57adfdfU, - 0x038f8c8cU, 0x59f8a1a1U, 0x09808989U, 0x1a170d0dU, - 0x65dabfbfU, 0xd731e6e6U, 0x84c64242U, 0xd0b86868U, - 0x82c34141U, 0x29b09999U, 0x5a772d2dU, 0x1e110f0fU, - 0x7bcbb0b0U, 0xa8fc5454U, 0x6dd6bbbbU, 0x2c3a1616U -}; + { + 0xa56363c6U, 0x847c7cf8U, 0x997777eeU, 0x8d7b7bf6U, + 0x0df2f2ffU, 0xbd6b6bd6U, 0xb16f6fdeU, 0x54c5c591U, + 0x50303060U, 0x03010102U, 0xa96767ceU, 0x7d2b2b56U, + 0x19fefee7U, 0x62d7d7b5U, 0xe6abab4dU, 0x9a7676ecU, + 0x45caca8fU, 0x9d82821fU, 0x40c9c989U, 0x877d7dfaU, + 0x15fafaefU, 0xeb5959b2U, 0xc947478eU, 0x0bf0f0fbU, + 0xecadad41U, 0x67d4d4b3U, 0xfda2a25fU, 0xeaafaf45U, + 0xbf9c9c23U, 0xf7a4a453U, 0x967272e4U, 0x5bc0c09bU, + 0xc2b7b775U, 0x1cfdfde1U, 0xae93933dU, 0x6a26264cU, + 0x5a36366cU, 0x413f3f7eU, 0x02f7f7f5U, 0x4fcccc83U, + 0x5c343468U, 0xf4a5a551U, 0x34e5e5d1U, 0x08f1f1f9U, + 0x937171e2U, 0x73d8d8abU, 0x53313162U, 0x3f15152aU, + 0x0c040408U, 0x52c7c795U, 0x65232346U, 0x5ec3c39dU, + 0x28181830U, 0xa1969637U, 0x0f05050aU, 0xb59a9a2fU, + 0x0907070eU, 0x36121224U, 0x9b80801bU, 0x3de2e2dfU, + 0x26ebebcdU, 0x6927274eU, 0xcdb2b27fU, 0x9f7575eaU, + 0x1b090912U, 0x9e83831dU, 0x742c2c58U, 0x2e1a1a34U, + 0x2d1b1b36U, 0xb26e6edcU, 0xee5a5ab4U, 0xfba0a05bU, + 0xf65252a4U, 0x4d3b3b76U, 0x61d6d6b7U, 0xceb3b37dU, + 0x7b292952U, 0x3ee3e3ddU, 0x712f2f5eU, 0x97848413U, + 0xf55353a6U, 0x68d1d1b9U, 0x00000000U, 0x2cededc1U, + 0x60202040U, 0x1ffcfce3U, 0xc8b1b179U, 0xed5b5bb6U, + 0xbe6a6ad4U, 0x46cbcb8dU, 0xd9bebe67U, 0x4b393972U, + 0xde4a4a94U, 0xd44c4c98U, 0xe85858b0U, 0x4acfcf85U, + 0x6bd0d0bbU, 0x2aefefc5U, 0xe5aaaa4fU, 0x16fbfbedU, + 0xc5434386U, 0xd74d4d9aU, 0x55333366U, 0x94858511U, + 0xcf45458aU, 0x10f9f9e9U, 0x06020204U, 0x817f7ffeU, + 0xf05050a0U, 0x443c3c78U, 0xba9f9f25U, 0xe3a8a84bU, + 0xf35151a2U, 0xfea3a35dU, 0xc0404080U, 0x8a8f8f05U, + 0xad92923fU, 0xbc9d9d21U, 0x48383870U, 0x04f5f5f1U, + 0xdfbcbc63U, 0xc1b6b677U, 0x75dadaafU, 0x63212142U, + 0x30101020U, 0x1affffe5U, 0x0ef3f3fdU, 0x6dd2d2bfU, + 0x4ccdcd81U, 0x140c0c18U, 0x35131326U, 0x2fececc3U, + 0xe15f5fbeU, 0xa2979735U, 0xcc444488U, 0x3917172eU, + 0x57c4c493U, 0xf2a7a755U, 0x827e7efcU, 0x473d3d7aU, + 0xac6464c8U, 0xe75d5dbaU, 0x2b191932U, 0x957373e6U, + 0xa06060c0U, 0x98818119U, 0xd14f4f9eU, 0x7fdcdca3U, + 0x66222244U, 0x7e2a2a54U, 0xab90903bU, 0x8388880bU, + 0xca46468cU, 0x29eeeec7U, 0xd3b8b86bU, 0x3c141428U, + 0x79dedea7U, 0xe25e5ebcU, 0x1d0b0b16U, 0x76dbdbadU, + 0x3be0e0dbU, 0x56323264U, 0x4e3a3a74U, 0x1e0a0a14U, + 0xdb494992U, 0x0a06060cU, 0x6c242448U, 0xe45c5cb8U, + 0x5dc2c29fU, 0x6ed3d3bdU, 0xefacac43U, 0xa66262c4U, + 0xa8919139U, 0xa4959531U, 0x37e4e4d3U, 0x8b7979f2U, + 0x32e7e7d5U, 0x43c8c88bU, 0x5937376eU, 0xb76d6ddaU, + 0x8c8d8d01U, 0x64d5d5b1U, 0xd24e4e9cU, 0xe0a9a949U, + 0xb46c6cd8U, 0xfa5656acU, 0x07f4f4f3U, 0x25eaeacfU, + 0xaf6565caU, 0x8e7a7af4U, 0xe9aeae47U, 0x18080810U, + 0xd5baba6fU, 0x887878f0U, 0x6f25254aU, 0x722e2e5cU, + 0x241c1c38U, 0xf1a6a657U, 0xc7b4b473U, 0x51c6c697U, + 0x23e8e8cbU, 0x7cdddda1U, 0x9c7474e8U, 0x211f1f3eU, + 0xdd4b4b96U, 0xdcbdbd61U, 0x868b8b0dU, 0x858a8a0fU, + 0x907070e0U, 0x423e3e7cU, 0xc4b5b571U, 0xaa6666ccU, + 0xd8484890U, 0x05030306U, 0x01f6f6f7U, 0x120e0e1cU, + 0xa36161c2U, 0x5f35356aU, 0xf95757aeU, 0xd0b9b969U, + 0x91868617U, 0x58c1c199U, 0x271d1d3aU, 0xb99e9e27U, + 0x38e1e1d9U, 0x13f8f8ebU, 0xb398982bU, 0x33111122U, + 0xbb6969d2U, 0x70d9d9a9U, 0x898e8e07U, 0xa7949433U, + 0xb69b9b2dU, 0x221e1e3cU, 0x92878715U, 0x20e9e9c9U, + 0x49cece87U, 0xff5555aaU, 0x78282850U, 0x7adfdfa5U, + 0x8f8c8c03U, 0xf8a1a159U, 0x80898909U, 0x170d0d1aU, + 0xdabfbf65U, 0x31e6e6d7U, 0xc6424284U, 0xb86868d0U, + 0xc3414182U, 0xb0999929U, 0x772d2d5aU, 0x110f0f1eU, + 0xcbb0b07bU, 0xfc5454a8U, 0xd6bbbb6dU, 0x3a16162cU, + 0x6363c6a5U, 0x7c7cf884U, 0x7777ee99U, 0x7b7bf68dU, + 0xf2f2ff0dU, 0x6b6bd6bdU, 0x6f6fdeb1U, 0xc5c59154U, + 0x30306050U, 0x01010203U, 0x6767cea9U, 0x2b2b567dU, + 0xfefee719U, 0xd7d7b562U, 0xabab4de6U, 0x7676ec9aU, + 0xcaca8f45U, 0x82821f9dU, 0xc9c98940U, 0x7d7dfa87U, + 0xfafaef15U, 0x5959b2ebU, 0x47478ec9U, 0xf0f0fb0bU, + 0xadad41ecU, 0xd4d4b367U, 0xa2a25ffdU, 0xafaf45eaU, + 0x9c9c23bfU, 0xa4a453f7U, 0x7272e496U, 0xc0c09b5bU, + 0xb7b775c2U, 0xfdfde11cU, 0x93933daeU, 0x26264c6aU, + 0x36366c5aU, 0x3f3f7e41U, 0xf7f7f502U, 0xcccc834fU, + 0x3434685cU, 0xa5a551f4U, 0xe5e5d134U, 0xf1f1f908U, + 0x7171e293U, 0xd8d8ab73U, 0x31316253U, 0x15152a3fU, + 0x0404080cU, 0xc7c79552U, 0x23234665U, 0xc3c39d5eU, + 0x18183028U, 0x969637a1U, 0x05050a0fU, 0x9a9a2fb5U, + 0x07070e09U, 0x12122436U, 0x80801b9bU, 0xe2e2df3dU, + 0xebebcd26U, 0x27274e69U, 0xb2b27fcdU, 0x7575ea9fU, + 0x0909121bU, 0x83831d9eU, 0x2c2c5874U, 0x1a1a342eU, + 0x1b1b362dU, 0x6e6edcb2U, 0x5a5ab4eeU, 0xa0a05bfbU, + 0x5252a4f6U, 0x3b3b764dU, 0xd6d6b761U, 0xb3b37dceU, + 0x2929527bU, 0xe3e3dd3eU, 0x2f2f5e71U, 0x84841397U, + 0x5353a6f5U, 0xd1d1b968U, 0x00000000U, 0xededc12cU, + 0x20204060U, 0xfcfce31fU, 0xb1b179c8U, 0x5b5bb6edU, + 0x6a6ad4beU, 0xcbcb8d46U, 0xbebe67d9U, 0x3939724bU, + 0x4a4a94deU, 0x4c4c98d4U, 0x5858b0e8U, 0xcfcf854aU, + 0xd0d0bb6bU, 0xefefc52aU, 0xaaaa4fe5U, 0xfbfbed16U, + 0x434386c5U, 0x4d4d9ad7U, 0x33336655U, 0x85851194U, + 0x45458acfU, 0xf9f9e910U, 0x02020406U, 0x7f7ffe81U, + 0x5050a0f0U, 0x3c3c7844U, 0x9f9f25baU, 0xa8a84be3U, + 0x5151a2f3U, 0xa3a35dfeU, 0x404080c0U, 0x8f8f058aU, + 0x92923fadU, 0x9d9d21bcU, 0x38387048U, 0xf5f5f104U, + 0xbcbc63dfU, 0xb6b677c1U, 0xdadaaf75U, 0x21214263U, + 0x10102030U, 0xffffe51aU, 0xf3f3fd0eU, 0xd2d2bf6dU, + 0xcdcd814cU, 0x0c0c1814U, 0x13132635U, 0xececc32fU, + 0x5f5fbee1U, 0x979735a2U, 0x444488ccU, 0x17172e39U, + 0xc4c49357U, 0xa7a755f2U, 0x7e7efc82U, 0x3d3d7a47U, + 0x6464c8acU, 0x5d5dbae7U, 0x1919322bU, 0x7373e695U, + 0x6060c0a0U, 0x81811998U, 0x4f4f9ed1U, 0xdcdca37fU, + 0x22224466U, 0x2a2a547eU, 0x90903babU, 0x88880b83U, + 0x46468ccaU, 0xeeeec729U, 0xb8b86bd3U, 0x1414283cU, + 0xdedea779U, 0x5e5ebce2U, 0x0b0b161dU, 0xdbdbad76U, + 0xe0e0db3bU, 0x32326456U, 0x3a3a744eU, 0x0a0a141eU, + 0x494992dbU, 0x06060c0aU, 0x2424486cU, 0x5c5cb8e4U, + 0xc2c29f5dU, 0xd3d3bd6eU, 0xacac43efU, 0x6262c4a6U, + 0x919139a8U, 0x959531a4U, 0xe4e4d337U, 0x7979f28bU, + 0xe7e7d532U, 0xc8c88b43U, 0x37376e59U, 0x6d6ddab7U, + 0x8d8d018cU, 0xd5d5b164U, 0x4e4e9cd2U, 0xa9a949e0U, + 0x6c6cd8b4U, 0x5656acfaU, 0xf4f4f307U, 0xeaeacf25U, + 0x6565caafU, 0x7a7af48eU, 0xaeae47e9U, 0x08081018U, + 0xbaba6fd5U, 0x7878f088U, 0x25254a6fU, 0x2e2e5c72U, + 0x1c1c3824U, 0xa6a657f1U, 0xb4b473c7U, 0xc6c69751U, + 0xe8e8cb23U, 0xdddda17cU, 0x7474e89cU, 0x1f1f3e21U, + 0x4b4b96ddU, 0xbdbd61dcU, 0x8b8b0d86U, 0x8a8a0f85U, + 0x7070e090U, 0x3e3e7c42U, 0xb5b571c4U, 0x6666ccaaU, + 0x484890d8U, 0x03030605U, 0xf6f6f701U, 0x0e0e1c12U, + 0x6161c2a3U, 0x35356a5fU, 0x5757aef9U, 0xb9b969d0U, + 0x86861791U, 0xc1c19958U, 0x1d1d3a27U, 0x9e9e27b9U, + 0xe1e1d938U, 0xf8f8eb13U, 0x98982bb3U, 0x11112233U, + 0x6969d2bbU, 0xd9d9a970U, 0x8e8e0789U, 0x949433a7U, + 0x9b9b2db6U, 0x1e1e3c22U, 0x87871592U, 0xe9e9c920U, + 0xcece8749U, 0x5555aaffU, 0x28285078U, 0xdfdfa57aU, + 0x8c8c038fU, 0xa1a159f8U, 0x89890980U, 0x0d0d1a17U, + 0xbfbf65daU, 0xe6e6d731U, 0x424284c6U, 0x6868d0b8U, + 0x414182c3U, 0x999929b0U, 0x2d2d5a77U, 0x0f0f1e11U, + 0xb0b07bcbU, 0x5454a8fcU, 0xbbbb6dd6U, 0x16162c3aU, + 0x63c6a563U, 0x7cf8847cU, 0x77ee9977U, 0x7bf68d7bU, + 0xf2ff0df2U, 0x6bd6bd6bU, 0x6fdeb16fU, 0xc59154c5U, + 0x30605030U, 0x01020301U, 0x67cea967U, 0x2b567d2bU, + 0xfee719feU, 0xd7b562d7U, 0xab4de6abU, 0x76ec9a76U, + 0xca8f45caU, 0x821f9d82U, 0xc98940c9U, 0x7dfa877dU, + 0xfaef15faU, 0x59b2eb59U, 0x478ec947U, 0xf0fb0bf0U, + 0xad41ecadU, 0xd4b367d4U, 0xa25ffda2U, 0xaf45eaafU, + 0x9c23bf9cU, 0xa453f7a4U, 0x72e49672U, 0xc09b5bc0U, + 0xb775c2b7U, 0xfde11cfdU, 0x933dae93U, 0x264c6a26U, + 0x366c5a36U, 0x3f7e413fU, 0xf7f502f7U, 0xcc834fccU, + 0x34685c34U, 0xa551f4a5U, 0xe5d134e5U, 0xf1f908f1U, + 0x71e29371U, 0xd8ab73d8U, 0x31625331U, 0x152a3f15U, + 0x04080c04U, 0xc79552c7U, 0x23466523U, 0xc39d5ec3U, + 0x18302818U, 0x9637a196U, 0x050a0f05U, 0x9a2fb59aU, + 0x070e0907U, 0x12243612U, 0x801b9b80U, 0xe2df3de2U, + 0xebcd26ebU, 0x274e6927U, 0xb27fcdb2U, 0x75ea9f75U, + 0x09121b09U, 0x831d9e83U, 0x2c58742cU, 0x1a342e1aU, + 0x1b362d1bU, 0x6edcb26eU, 0x5ab4ee5aU, 0xa05bfba0U, + 0x52a4f652U, 0x3b764d3bU, 0xd6b761d6U, 0xb37dceb3U, + 0x29527b29U, 0xe3dd3ee3U, 0x2f5e712fU, 0x84139784U, + 0x53a6f553U, 0xd1b968d1U, 0x00000000U, 0xedc12cedU, + 0x20406020U, 0xfce31ffcU, 0xb179c8b1U, 0x5bb6ed5bU, + 0x6ad4be6aU, 0xcb8d46cbU, 0xbe67d9beU, 0x39724b39U, + 0x4a94de4aU, 0x4c98d44cU, 0x58b0e858U, 0xcf854acfU, + 0xd0bb6bd0U, 0xefc52aefU, 0xaa4fe5aaU, 0xfbed16fbU, + 0x4386c543U, 0x4d9ad74dU, 0x33665533U, 0x85119485U, + 0x458acf45U, 0xf9e910f9U, 0x02040602U, 0x7ffe817fU, + 0x50a0f050U, 0x3c78443cU, 0x9f25ba9fU, 0xa84be3a8U, + 0x51a2f351U, 0xa35dfea3U, 0x4080c040U, 0x8f058a8fU, + 0x923fad92U, 0x9d21bc9dU, 0x38704838U, 0xf5f104f5U, + 0xbc63dfbcU, 0xb677c1b6U, 0xdaaf75daU, 0x21426321U, + 0x10203010U, 0xffe51affU, 0xf3fd0ef3U, 0xd2bf6dd2U, + 0xcd814ccdU, 0x0c18140cU, 0x13263513U, 0xecc32fecU, + 0x5fbee15fU, 0x9735a297U, 0x4488cc44U, 0x172e3917U, + 0xc49357c4U, 0xa755f2a7U, 0x7efc827eU, 0x3d7a473dU, + 0x64c8ac64U, 0x5dbae75dU, 0x19322b19U, 0x73e69573U, + 0x60c0a060U, 0x81199881U, 0x4f9ed14fU, 0xdca37fdcU, + 0x22446622U, 0x2a547e2aU, 0x903bab90U, 0x880b8388U, + 0x468cca46U, 0xeec729eeU, 0xb86bd3b8U, 0x14283c14U, + 0xdea779deU, 0x5ebce25eU, 0x0b161d0bU, 0xdbad76dbU, + 0xe0db3be0U, 0x32645632U, 0x3a744e3aU, 0x0a141e0aU, + 0x4992db49U, 0x060c0a06U, 0x24486c24U, 0x5cb8e45cU, + 0xc29f5dc2U, 0xd3bd6ed3U, 0xac43efacU, 0x62c4a662U, + 0x9139a891U, 0x9531a495U, 0xe4d337e4U, 0x79f28b79U, + 0xe7d532e7U, 0xc88b43c8U, 0x376e5937U, 0x6ddab76dU, + 0x8d018c8dU, 0xd5b164d5U, 0x4e9cd24eU, 0xa949e0a9U, + 0x6cd8b46cU, 0x56acfa56U, 0xf4f307f4U, 0xeacf25eaU, + 0x65caaf65U, 0x7af48e7aU, 0xae47e9aeU, 0x08101808U, + 0xba6fd5baU, 0x78f08878U, 0x254a6f25U, 0x2e5c722eU, + 0x1c38241cU, 0xa657f1a6U, 0xb473c7b4U, 0xc69751c6U, + 0xe8cb23e8U, 0xdda17cddU, 0x74e89c74U, 0x1f3e211fU, + 0x4b96dd4bU, 0xbd61dcbdU, 0x8b0d868bU, 0x8a0f858aU, + 0x70e09070U, 0x3e7c423eU, 0xb571c4b5U, 0x66ccaa66U, + 0x4890d848U, 0x03060503U, 0xf6f701f6U, 0x0e1c120eU, + 0x61c2a361U, 0x356a5f35U, 0x57aef957U, 0xb969d0b9U, + 0x86179186U, 0xc19958c1U, 0x1d3a271dU, 0x9e27b99eU, + 0xe1d938e1U, 0xf8eb13f8U, 0x982bb398U, 0x11223311U, + 0x69d2bb69U, 0xd9a970d9U, 0x8e07898eU, 0x9433a794U, + 0x9b2db69bU, 0x1e3c221eU, 0x87159287U, 0xe9c920e9U, + 0xce8749ceU, 0x55aaff55U, 0x28507828U, 0xdfa57adfU, + 0x8c038f8cU, 0xa159f8a1U, 0x89098089U, 0x0d1a170dU, + 0xbf65dabfU, 0xe6d731e6U, 0x4284c642U, 0x68d0b868U, + 0x4182c341U, 0x9929b099U, 0x2d5a772dU, 0x0f1e110fU, + 0xb07bcbb0U, 0x54a8fc54U, 0xbb6dd6bbU, 0x162c3a16U, + 0xc6a56363U, 0xf8847c7cU, 0xee997777U, 0xf68d7b7bU, + 0xff0df2f2U, 0xd6bd6b6bU, 0xdeb16f6fU, 0x9154c5c5U, + 0x60503030U, 0x02030101U, 0xcea96767U, 0x567d2b2bU, + 0xe719fefeU, 0xb562d7d7U, 0x4de6ababU, 0xec9a7676U, + 0x8f45cacaU, 0x1f9d8282U, 0x8940c9c9U, 0xfa877d7dU, + 0xef15fafaU, 0xb2eb5959U, 0x8ec94747U, 0xfb0bf0f0U, + 0x41ecadadU, 0xb367d4d4U, 0x5ffda2a2U, 0x45eaafafU, + 0x23bf9c9cU, 0x53f7a4a4U, 0xe4967272U, 0x9b5bc0c0U, + 0x75c2b7b7U, 0xe11cfdfdU, 0x3dae9393U, 0x4c6a2626U, + 0x6c5a3636U, 0x7e413f3fU, 0xf502f7f7U, 0x834fccccU, + 0x685c3434U, 0x51f4a5a5U, 0xd134e5e5U, 0xf908f1f1U, + 0xe2937171U, 0xab73d8d8U, 0x62533131U, 0x2a3f1515U, + 0x080c0404U, 0x9552c7c7U, 0x46652323U, 0x9d5ec3c3U, + 0x30281818U, 0x37a19696U, 0x0a0f0505U, 0x2fb59a9aU, + 0x0e090707U, 0x24361212U, 0x1b9b8080U, 0xdf3de2e2U, + 0xcd26ebebU, 0x4e692727U, 0x7fcdb2b2U, 0xea9f7575U, + 0x121b0909U, 0x1d9e8383U, 0x58742c2cU, 0x342e1a1aU, + 0x362d1b1bU, 0xdcb26e6eU, 0xb4ee5a5aU, 0x5bfba0a0U, + 0xa4f65252U, 0x764d3b3bU, 0xb761d6d6U, 0x7dceb3b3U, + 0x527b2929U, 0xdd3ee3e3U, 0x5e712f2fU, 0x13978484U, + 0xa6f55353U, 0xb968d1d1U, 0x00000000U, 0xc12cededU, + 0x40602020U, 0xe31ffcfcU, 0x79c8b1b1U, 0xb6ed5b5bU, + 0xd4be6a6aU, 0x8d46cbcbU, 0x67d9bebeU, 0x724b3939U, + 0x94de4a4aU, 0x98d44c4cU, 0xb0e85858U, 0x854acfcfU, + 0xbb6bd0d0U, 0xc52aefefU, 0x4fe5aaaaU, 0xed16fbfbU, + 0x86c54343U, 0x9ad74d4dU, 0x66553333U, 0x11948585U, + 0x8acf4545U, 0xe910f9f9U, 0x04060202U, 0xfe817f7fU, + 0xa0f05050U, 0x78443c3cU, 0x25ba9f9fU, 0x4be3a8a8U, + 0xa2f35151U, 0x5dfea3a3U, 0x80c04040U, 0x058a8f8fU, + 0x3fad9292U, 0x21bc9d9dU, 0x70483838U, 0xf104f5f5U, + 0x63dfbcbcU, 0x77c1b6b6U, 0xaf75dadaU, 0x42632121U, + 0x20301010U, 0xe51affffU, 0xfd0ef3f3U, 0xbf6dd2d2U, + 0x814ccdcdU, 0x18140c0cU, 0x26351313U, 0xc32fececU, + 0xbee15f5fU, 0x35a29797U, 0x88cc4444U, 0x2e391717U, + 0x9357c4c4U, 0x55f2a7a7U, 0xfc827e7eU, 0x7a473d3dU, + 0xc8ac6464U, 0xbae75d5dU, 0x322b1919U, 0xe6957373U, + 0xc0a06060U, 0x19988181U, 0x9ed14f4fU, 0xa37fdcdcU, + 0x44662222U, 0x547e2a2aU, 0x3bab9090U, 0x0b838888U, + 0x8cca4646U, 0xc729eeeeU, 0x6bd3b8b8U, 0x283c1414U, + 0xa779dedeU, 0xbce25e5eU, 0x161d0b0bU, 0xad76dbdbU, + 0xdb3be0e0U, 0x64563232U, 0x744e3a3aU, 0x141e0a0aU, + 0x92db4949U, 0x0c0a0606U, 0x486c2424U, 0xb8e45c5cU, + 0x9f5dc2c2U, 0xbd6ed3d3U, 0x43efacacU, 0xc4a66262U, + 0x39a89191U, 0x31a49595U, 0xd337e4e4U, 0xf28b7979U, + 0xd532e7e7U, 0x8b43c8c8U, 0x6e593737U, 0xdab76d6dU, + 0x018c8d8dU, 0xb164d5d5U, 0x9cd24e4eU, 0x49e0a9a9U, + 0xd8b46c6cU, 0xacfa5656U, 0xf307f4f4U, 0xcf25eaeaU, + 0xcaaf6565U, 0xf48e7a7aU, 0x47e9aeaeU, 0x10180808U, + 0x6fd5babaU, 0xf0887878U, 0x4a6f2525U, 0x5c722e2eU, + 0x38241c1cU, 0x57f1a6a6U, 0x73c7b4b4U, 0x9751c6c6U, + 0xcb23e8e8U, 0xa17cddddU, 0xe89c7474U, 0x3e211f1fU, + 0x96dd4b4bU, 0x61dcbdbdU, 0x0d868b8bU, 0x0f858a8aU, + 0xe0907070U, 0x7c423e3eU, 0x71c4b5b5U, 0xccaa6666U, + 0x90d84848U, 0x06050303U, 0xf701f6f6U, 0x1c120e0eU, + 0xc2a36161U, 0x6a5f3535U, 0xaef95757U, 0x69d0b9b9U, + 0x17918686U, 0x9958c1c1U, 0x3a271d1dU, 0x27b99e9eU, + 0xd938e1e1U, 0xeb13f8f8U, 0x2bb39898U, 0x22331111U, + 0xd2bb6969U, 0xa970d9d9U, 0x07898e8eU, 0x33a79494U, + 0x2db69b9bU, 0x3c221e1eU, 0x15928787U, 0xc920e9e9U, + 0x8749ceceU, 0xaaff5555U, 0x50782828U, 0xa57adfdfU, + 0x038f8c8cU, 0x59f8a1a1U, 0x09808989U, 0x1a170d0dU, + 0x65dabfbfU, 0xd731e6e6U, 0x84c64242U, 0xd0b86868U, + 0x82c34141U, 0x29b09999U, 0x5a772d2dU, 0x1e110f0fU, + 0x7bcbb0b0U, 0xa8fc5454U, 0x6dd6bbbbU, 0x2c3a1616U}; -#define t_fn0(x) (sharedMemory[ (x)]) +#define t_fn0(x) (sharedMemory[(x)]) #define t_fn1(x) (sharedMemory[256 + (x)]) #define t_fn2(x) (sharedMemory[512 + (x)]) #define t_fn3(x) (sharedMemory[768 + (x)]) +#define round(dummy, y, x, k) \ + y[0] = (k)[0] ^ (t_fn0(x[0] & 0xff) ^ t_fn1((x[1] >> 8) & 0xff) ^ t_fn2((x[2] >> 16) & 0xff) ^ t_fn3((x[3] >> 24))); \ + y[1] = (k)[1] ^ (t_fn0(x[1] & 0xff) ^ t_fn1((x[2] >> 8) & 0xff) ^ t_fn2((x[3] >> 16) & 0xff) ^ t_fn3((x[0] >> 24))); \ + y[2] = (k)[2] ^ (t_fn0(x[2] & 0xff) ^ t_fn1((x[3] >> 8) & 0xff) ^ t_fn2((x[0] >> 16) & 0xff) ^ t_fn3((x[1] >> 24))); \ + y[3] = (k)[3] ^ (t_fn0(x[3] & 0xff) ^ t_fn1((x[0] >> 8) & 0xff) ^ t_fn2((x[1] >> 16) & 0xff) ^ t_fn3((x[2] >> 24))); -#define round(dummy,y,x,k) \ - y[0] = (k)[0] ^ (t_fn0(x[0] & 0xff) ^ t_fn1((x[1] >> 8) & 0xff) ^ t_fn2((x[2] >> 16) & 0xff) ^ t_fn3((x[3] >> 24))); \ - y[1] = (k)[1] ^ (t_fn0(x[1] & 0xff) ^ t_fn1((x[2] >> 8) & 0xff) ^ t_fn2((x[3] >> 16) & 0xff) ^ t_fn3((x[0] >> 24))); \ - y[2] = (k)[2] ^ (t_fn0(x[2] & 0xff) ^ t_fn1((x[3] >> 8) & 0xff) ^ t_fn2((x[0] >> 16) & 0xff) ^ t_fn3((x[1] >> 24))); \ - y[3] = (k)[3] ^ (t_fn0(x[3] & 0xff) ^ t_fn1((x[0] >> 8) & 0xff) ^ t_fn2((x[1] >> 16) & 0xff) ^ t_fn3((x[2] >> 24) )); - -__device__ __forceinline__ static void cn_aes_single_round(uint32_t * __restrict__ sharedMemory, const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t * __restrict__ expandedKey) +__device__ __forceinline__ static void cn_aes_single_round(uint32_t* __restrict__ sharedMemory, const uint32_t* __restrict__ in, uint32_t* __restrict__ out, const uint32_t* __restrict__ expandedKey) { round(sharedMemory, out, in, expandedKey); } -__device__ __forceinline__ static void cn_aes_pseudo_round_mut(const uint32_t * __restrict__ sharedMemory, uint32_t * __restrict__ val, const uint32_t * __restrict__ expandedKey) +__device__ __forceinline__ static void cn_aes_pseudo_round_mut(const uint32_t* __restrict__ sharedMemory, uint32_t* __restrict__ val, const uint32_t* __restrict__ expandedKey) { uint32_t b1[4]; round(sharedMemory, b1, val, expandedKey); @@ -298,14 +296,14 @@ __device__ __forceinline__ static void cn_aes_pseudo_round_mut(const uint32_t * round(sharedMemory, val, b1, expandedKey + 9 * N_COLS); } -__device__ __forceinline__ static void cn_aes_gpu_init(uint32_t *sharedMemory) +__device__ __forceinline__ static void cn_aes_gpu_init(uint32_t* sharedMemory) { for(int i = threadIdx.x; i < 1024; i += blockDim.x) sharedMemory[i] = d_t_fn[i]; } -__device__ __forceinline__ static void cn_aes_gpu_init_half(uint32_t *sharedMemory) +__device__ __forceinline__ static void cn_aes_gpu_init_half(uint32_t* sharedMemory) { - for(int i = threadIdx.x; i < 512; i += blockDim.x) - sharedMemory[i] = d_t_fn[i]; + for(int i = threadIdx.x; i < 512; i += blockDim.x) + sharedMemory[i] = d_t_fn[i]; } diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_blake.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_blake.hpp index 611fe1c8c..efd57c944 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_blake.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_blake.hpp @@ -1,64 +1,68 @@ #pragma once -typedef struct { +#include "cuda_extra.hpp" + +typedef struct +{ uint32_t h[8], s[4], t[2]; uint32_t buflen; int nullt; uint8_t buf[64]; } blake_state; -#define U8TO32(p) \ +#define U8TO32(p) \ (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ - ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]) )) - -#define U32TO8(p, v) \ - (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \ - (p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) ); - -#define BLAKE_ROT(x,n) ROTR32(x, n) -#define BLAKE_G(a,b,c,d,e) \ - v[a] += (m[d_blake_sigma[i][e]] ^ d_blake_cst[d_blake_sigma[i][e+1]]) + v[b]; \ - v[d] = BLAKE_ROT(v[d] ^ v[a],16); \ - v[c] += v[d]; \ - v[b] = BLAKE_ROT(v[b] ^ v[c],12); \ - v[a] += (m[d_blake_sigma[i][e+1]] ^ d_blake_cst[d_blake_sigma[i][e]])+v[b]; \ - v[d] = BLAKE_ROT(v[d] ^ v[a], 8); \ - v[c] += v[d]; \ + ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]))) + +#define U32TO8(p, v) \ + (p)[0] = (uint8_t)((v) >> 24); \ + (p)[1] = (uint8_t)((v) >> 16); \ + (p)[2] = (uint8_t)((v) >> 8); \ + (p)[3] = (uint8_t)((v)); + +#define BLAKE_ROT(x, n) ROTR32(x, n) +#define BLAKE_G(a, b, c, d, e) \ + v[a] += (m[d_blake_sigma[i][e]] ^ d_blake_cst[d_blake_sigma[i][e + 1]]) + v[b]; \ + v[d] = BLAKE_ROT(v[d] ^ v[a], 16); \ + v[c] += v[d]; \ + v[b] = BLAKE_ROT(v[b] ^ v[c], 12); \ + v[a] += (m[d_blake_sigma[i][e + 1]] ^ d_blake_cst[d_blake_sigma[i][e]]) + v[b]; \ + v[d] = BLAKE_ROT(v[d] ^ v[a], 8); \ + v[c] += v[d]; \ v[b] = BLAKE_ROT(v[b] ^ v[c], 7); __constant__ uint8_t d_blake_sigma[14][16] = -{ - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, - {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, - {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, - {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, - {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}, - {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, - {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10}, - {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5}, - {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, - {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, - {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8} -}; -__constant__ uint32_t d_blake_cst[16] -= { + { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, + {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, + {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, + {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}, + {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, + {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10}, + {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5}, + {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, + {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}}; +__constant__ uint32_t d_blake_cst[16] = { 0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344, 0xA4093822, 0x299F31D0, 0x082EFA98, 0xEC4E6C89, 0x452821E6, 0x38D01377, 0xBE5466CF, 0x34E90C6C, - 0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917 -}; + 0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917}; -__device__ void cn_blake_compress(blake_state * S, const uint8_t * block) +__device__ void cn_blake_compress(blake_state* S, const uint8_t* block) { uint32_t v[16], m[16], i; - for (i = 0; i < 16; ++i) m[i] = U8TO32(block + i * 4); - for (i = 0; i < 8; ++i) v[i] = S->h[i]; - v[ 8] = S->s[0] ^ 0x243F6A88; - v[ 9] = S->s[1] ^ 0x85A308D3; + for(i = 0; i < 16; ++i) + m[i] = U8TO32(block + i * 4); + for(i = 0; i < 8; ++i) + v[i] = S->h[i]; + v[8] = S->s[0] ^ 0x243F6A88; + v[9] = S->s[1] ^ 0x85A308D3; v[10] = S->s[2] ^ 0x13198A2E; v[11] = S->s[3] ^ 0x03707344; v[12] = 0xA4093822; @@ -66,7 +70,7 @@ __device__ void cn_blake_compress(blake_state * S, const uint8_t * block) v[14] = 0x082EFA98; v[15] = 0xEC4E6C89; - if (S->nullt == 0) + if(S->nullt == 0) { v[12] ^= S->t[0]; v[13] ^= S->t[0]; @@ -74,50 +78,54 @@ __device__ void cn_blake_compress(blake_state * S, const uint8_t * block) v[15] ^= S->t[1]; } - for (i = 0; i < 14; ++i) + for(i = 0; i < 14; ++i) { - BLAKE_G(0, 4, 8, 12, 0); - BLAKE_G(1, 5, 9, 13, 2); - BLAKE_G(2, 6, 10, 14, 4); - BLAKE_G(3, 7, 11, 15, 6); - BLAKE_G(3, 4, 9, 14, 14); - BLAKE_G(2, 7, 8, 13, 12); - BLAKE_G(0, 5, 10, 15, 8); + BLAKE_G(0, 4, 8, 12, 0); + BLAKE_G(1, 5, 9, 13, 2); + BLAKE_G(2, 6, 10, 14, 4); + BLAKE_G(3, 7, 11, 15, 6); + BLAKE_G(3, 4, 9, 14, 14); + BLAKE_G(2, 7, 8, 13, 12); + BLAKE_G(0, 5, 10, 15, 8); BLAKE_G(1, 6, 11, 12, 10); } - for (i = 0; i < 16; ++i) S->h[i % 8] ^= v[i]; - for (i = 0; i < 8; ++i) S->h[i] ^= S->s[i % 4]; + for(i = 0; i < 16; ++i) + S->h[i % 8] ^= v[i]; + for(i = 0; i < 8; ++i) + S->h[i] ^= S->s[i % 4]; } -__device__ void cn_blake_update(blake_state * S, const uint8_t * data, uint64_t datalen) +__device__ void cn_blake_update(blake_state* S, const uint8_t* data, uint64_t datalen) { uint32_t left = S->buflen >> 3; uint32_t fill = 64 - left; - if (left && (((datalen >> 3) & 0x3F) >= fill)) + if(left && (((datalen >> 3) & 0x3F) >= fill)) { - memcpy((void *) (S->buf + left), (void *) data, fill); + memcpy((void*)(S->buf + left), (void*)data, fill); S->t[0] += 512; - if (S->t[0] == 0) S->t[1]++; + if(S->t[0] == 0) + S->t[1]++; cn_blake_compress(S, S->buf); data += fill; datalen -= (fill << 3); left = 0; } - while (datalen >= 512) + while(datalen >= 512) { S->t[0] += 512; - if (S->t[0] == 0) S->t[1]++; + if(S->t[0] == 0) + S->t[1]++; cn_blake_compress(S, data); data += 64; datalen -= 512; } - if (datalen > 0) + if(datalen > 0) { - memcpy((void *) (S->buf + left), (void *) data, datalen >> 3); + memcpy((void*)(S->buf + left), (void*)data, datalen >> 3); S->buflen = (left << 3) + datalen; } else @@ -126,31 +134,32 @@ __device__ void cn_blake_update(blake_state * S, const uint8_t * data, uint64_ } } -__device__ void cn_blake_final(blake_state * S, uint8_t * digest) +__device__ void cn_blake_final(blake_state* S, uint8_t* digest) { const uint8_t padding[] = - { - 0x80,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 - }; + { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; uint8_t pa = 0x81, pb = 0x01; uint8_t msglen[8]; uint32_t lo = S->t[0] + S->buflen, hi = S->t[1]; - if (lo < (unsigned) S->buflen) hi++; + if(lo < (unsigned)S->buflen) + hi++; U32TO8(msglen + 0, hi); U32TO8(msglen + 4, lo); - if (S->buflen == 440) + if(S->buflen == 440) { S->t[0] -= 8; cn_blake_update(S, &pa, 8); } else { - if (S->buflen < 440) + if(S->buflen < 440) { - if (S->buflen == 0) S->nullt = 1; + if(S->buflen == 0) + S->nullt = 1; S->t[0] -= 440 - S->buflen; cn_blake_update(S, padding, 440 - S->buflen); } @@ -168,9 +177,9 @@ __device__ void cn_blake_final(blake_state * S, uint8_t * digest) S->t[0] -= 64; cn_blake_update(S, msglen, 64); - U32TO8(digest + 0, S->h[0]); - U32TO8(digest + 4, S->h[1]); - U32TO8(digest + 8, S->h[2]); + U32TO8(digest + 0, S->h[0]); + U32TO8(digest + 4, S->h[1]); + U32TO8(digest + 8, S->h[2]); U32TO8(digest + 12, S->h[3]); U32TO8(digest + 16, S->h[4]); U32TO8(digest + 20, S->h[5]); @@ -178,17 +187,22 @@ __device__ void cn_blake_final(blake_state * S, uint8_t * digest) U32TO8(digest + 28, S->h[7]); } -__device__ void cn_blake(const uint8_t * in, uint64_t inlen, uint8_t * out) +__device__ void cn_blake(const uint8_t* in, uint64_t inlen, uint8_t* out) { blake_state bs; - blake_state *S = (blake_state *)&bs; - - S->h[0] = 0x6A09E667; S->h[1] = 0xBB67AE85; S->h[2] = 0x3C6EF372; - S->h[3] = 0xA54FF53A; S->h[4] = 0x510E527F; S->h[5] = 0x9B05688C; - S->h[6] = 0x1F83D9AB; S->h[7] = 0x5BE0CD19; + blake_state* S = (blake_state*)&bs; + + S->h[0] = 0x6A09E667; + S->h[1] = 0xBB67AE85; + S->h[2] = 0x3C6EF372; + S->h[3] = 0xA54FF53A; + S->h[4] = 0x510E527F; + S->h[5] = 0x9B05688C; + S->h[6] = 0x1F83D9AB; + S->h[7] = 0x5BE0CD19; S->t[0] = S->t[1] = S->buflen = S->nullt = 0; S->s[0] = S->s[1] = S->s[2] = S->s[3] = 0; - cn_blake_update(S, (uint8_t *)in, inlen * 8); - cn_blake_final(S, (uint8_t *)out); + cn_blake_update(S, (uint8_t*)in, inlen * 8); + cn_blake_final(S, (uint8_t*)out); } diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index 718cff0c7..7f610f9dc 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -1,55 +1,55 @@ #include "xmrstak/backend/cryptonight.hpp" -#include -#include -#include +#include #include #include -#include +#include +#include +#include -#include "xmrstak/jconf.hpp" -#include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp" -#include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp" -#include "xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp" #include "xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp" - +#include "xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp" +#include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp" +#include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp" +#include "xmrstak/jconf.hpp" #ifdef _WIN32 #include extern "C" void compat_usleep(uint64_t waitTime) { - if (waitTime > 0) - { - if (waitTime > 100) - { - // use a waitable timer for larger intervals > 0.1ms - - HANDLE timer; - LARGE_INTEGER ft; - - ft.QuadPart = -10ll * int64_t(waitTime); // Convert to 100 nanosecond interval, negative value indicates relative time - - timer = CreateWaitableTimer(NULL, TRUE, NULL); - SetWaitableTimer(timer, &ft, 0, NULL, NULL, 0); - WaitForSingleObject(timer, INFINITE); - CloseHandle(timer); - } - else - { - // use a polling loop for short intervals <= 100ms - - LARGE_INTEGER perfCnt, start, now; - __int64 elapsed; - - QueryPerformanceFrequency(&perfCnt); - QueryPerformanceCounter(&start); - do { - SwitchToThread(); - QueryPerformanceCounter((LARGE_INTEGER*) &now); - elapsed = (__int64)((now.QuadPart - start.QuadPart) / (float)perfCnt.QuadPart * 1000 * 1000); - } while ( elapsed < waitTime ); - } - } + if(waitTime > 0) + { + if(waitTime > 100) + { + // use a waitable timer for larger intervals > 0.1ms + + HANDLE timer; + LARGE_INTEGER ft; + + ft.QuadPart = -10ll * int64_t(waitTime); // Convert to 100 nanosecond interval, negative value indicates relative time + + timer = CreateWaitableTimer(NULL, TRUE, NULL); + SetWaitableTimer(timer, &ft, 0, NULL, NULL, 0); + WaitForSingleObject(timer, INFINITE); + CloseHandle(timer); + } + else + { + // use a polling loop for short intervals <= 100ms + + LARGE_INTEGER perfCnt, start, now; + __int64 elapsed; + + QueryPerformanceFrequency(&perfCnt); + QueryPerformanceCounter(&start); + do + { + SwitchToThread(); + QueryPerformanceCounter((LARGE_INTEGER*)&now); + elapsed = (__int64)((now.QuadPart - start.QuadPart) / (float)perfCnt.QuadPart * 1000 * 1000); + } while(elapsed < waitTime); + } + } } #else #include @@ -60,9 +60,9 @@ extern "C" void compat_usleep(uint64_t waitTime) #endif #include "cryptonight.hpp" -#include "cuda_extra.hpp" #include "cuda_aes.hpp" #include "cuda_device.hpp" +#include "cuda_extra.hpp" /* sm_2X is limited to 2GB due to the small TLB * therefore we never use 64bit indices @@ -73,106 +73,113 @@ typedef uint64_t IndexType; typedef int IndexType; #endif -__device__ __forceinline__ uint64_t cuda_mul128( uint64_t multiplier, uint64_t multiplicand, uint64_t& product_hi ) +__device__ __forceinline__ uint64_t cuda_mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t& product_hi) { - product_hi = __umul64hi( multiplier, multiplicand ); - return (multiplier * multiplicand ); + product_hi = __umul64hi(multiplier, multiplicand); + return (multiplier * multiplicand); } -template< typename T > -__device__ __forceinline__ T loadGlobal64( T * const addr ) +template +__device__ __forceinline__ T loadGlobal64(T* const addr) { -#if (__CUDA_ARCH__ < 700) +#if(__CUDA_ARCH__ < 700) T x; - asm volatile( "ld.global.cg.u64 %0, [%1];" : "=l"( x ) : "l"( addr ) ); + asm volatile("ld.global.cg.u64 %0, [%1];" + : "=l"(x) + : "l"(addr)); return x; #else return *addr; #endif } -template< typename T > -__device__ __forceinline__ T loadGlobal32( T * const addr ) +template +__device__ __forceinline__ T loadGlobal32(T* const addr) { -#if (__CUDA_ARCH__ < 700) +#if(__CUDA_ARCH__ < 700) T x; - asm volatile( "ld.global.cg.u32 %0, [%1];" : "=r"( x ) : "l"( addr ) ); + asm volatile("ld.global.cg.u32 %0, [%1];" + : "=r"(x) + : "l"(addr)); return x; #else return *addr; #endif } - -template< typename T > -__device__ __forceinline__ void storeGlobal32( T* addr, T const & val ) +template +__device__ __forceinline__ void storeGlobal32(T* addr, T const& val) { -#if (__CUDA_ARCH__ < 700) - asm volatile( "st.global.cg.u32 [%0], %1;" : : "l"( addr ), "r"( val ) ); +#if(__CUDA_ARCH__ < 700) + asm volatile("st.global.cg.u32 [%0], %1;" + : + : "l"(addr), "r"(val)); #else *addr = val; #endif } -template< typename T > -__device__ __forceinline__ void storeGlobal64( T* addr, T const & val ) +template +__device__ __forceinline__ void storeGlobal64(T* addr, T const& val) { -#if (__CUDA_ARCH__ < 700) - asm volatile( "st.global.cg.u64 [%0], %1;" : : "l"( addr ), "l"( val ) ); +#if(__CUDA_ARCH__ < 700) + asm volatile("st.global.cg.u64 [%0], %1;" + : + : "l"(addr), "l"(val)); #else *addr = val; #endif } -__device__ __forceinline__ uint32_t rotate16( const uint32_t n ) +__device__ __forceinline__ uint32_t rotate16(const uint32_t n) { return (n >> 16u) | (n << 16u); } __global__ void cryptonight_core_gpu_phase1( - const uint32_t ITERATIONS, const size_t MEMORY, - int threads, int bfactor, int partidx, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state2, uint32_t * __restrict__ ctx_key1 ) + const uint32_t ITERATIONS, const size_t MEMORY, + int threads, int bfactor, int partidx, uint32_t* __restrict__ long_state, uint32_t* __restrict__ ctx_state2, uint32_t* __restrict__ ctx_key1) { __shared__ uint32_t sharedMemory[1024]; - cn_aes_gpu_init( sharedMemory ); - __syncthreads( ); + cn_aes_gpu_init(sharedMemory); + __syncthreads(); - const int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 3; - const int sub = ( threadIdx.x & 7 ) << 2; + const int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3; + const int sub = (threadIdx.x & 7) << 2; const int batchsize = MEMORY >> bfactor; const int start = partidx * batchsize; const int end = start + batchsize; - if ( thread >= threads ) + if(thread >= threads) return; uint32_t key[40], text[4]; - MEMCPY8( key, ctx_key1 + thread * 40, 20 ); + MEMCPY8(key, ctx_key1 + thread * 40, 20); - if( partidx == 0 ) + if(partidx == 0) { // first round - MEMCPY8( text, ctx_state2 + thread * 50 + sub + 16, 2 ); + MEMCPY8(text, ctx_state2 + thread * 50 + sub + 16, 2); } else { // load previous text data - MEMCPY8( text, &long_state[( (uint64_t) thread * MEMORY ) + sub + start - 32], 2 ); + MEMCPY8(text, &long_state[((uint64_t)thread * MEMORY) + sub + start - 32], 2); } - __syncthreads( ); - for ( int i = start; i < end; i += 32 ) + __syncthreads(); + for(int i = start; i < end; i += 32) { - cn_aes_pseudo_round_mut( sharedMemory, text, key ); - MEMCPY8(&long_state[((uint64_t) thread * MEMORY) + (sub + i)], text, 2); + cn_aes_pseudo_round_mut(sharedMemory, text, key); + MEMCPY8(&long_state[((uint64_t)thread * MEMORY) + (sub + i)], text, 2); } } /** avoid warning `unused parameter` */ -template< typename T > -__forceinline__ __device__ void unusedVar( const T& ) +template +__forceinline__ __device__ void unusedVar(const T&) { } @@ -189,25 +196,25 @@ __forceinline__ __device__ void unusedVar( const T& ) * @param value value to share with other threads within the group * @param src thread number within the group from where the data is read, range [0:group_n] */ -template -__forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr,const uint32_t sub,const int val,const uint32_t src) +template +__forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr, const uint32_t sub, const int val, const uint32_t src) { -#if( __CUDA_ARCH__ < 300 ) - ptr[sub] = val; - return ptr[src & (group_n-1)]; +#if(__CUDA_ARCH__ < 300) + ptr[sub] = val; + return ptr[src & (group_n - 1)]; +#else + unusedVar(ptr); + unusedVar(sub); +#if(__CUDACC_VER_MAJOR__ >= 9) + return __shfl_sync(__activemask(), val, src, group_n); #else - unusedVar( ptr ); - unusedVar( sub ); -# if(__CUDACC_VER_MAJOR__ >= 9) - return __shfl_sync(__activemask(), val, src, group_n ); -# else - return __shfl( val, src, group_n ); -# endif + return __shfl(val, src, group_n); +#endif #endif } -template -__forceinline__ __device__ uint64_t shuffle64(volatile uint32_t* ptr,const uint32_t sub,const int val,const uint32_t src, const uint32_t src2) +template +__forceinline__ __device__ uint64_t shuffle64(volatile uint32_t* ptr, const uint32_t sub, const int val, const uint32_t src, const uint32_t src2) { uint64_t tmp; ((uint32_t*)&tmp)[0] = shuffle(ptr, sub, val, src); @@ -218,9 +225,9 @@ __forceinline__ __device__ uint64_t shuffle64(volatile uint32_t* ptr,const uint3 struct u64 : public uint2 { - __forceinline__ __device__ u64(){} + __forceinline__ __device__ u64() {} - __forceinline__ __device__ u64( const uint32_t x0, const uint32_t x1) + __forceinline__ __device__ u64(const uint32_t x0, const uint32_t x1) { uint2::x = x0; uint2::y = x1; @@ -231,7 +238,7 @@ struct u64 : public uint2 return *((uint64_t*)this); } - __forceinline__ __device__ u64( const uint64_t x0) + __forceinline__ __device__ u64(const uint64_t x0) { ((uint64_t*)&this->x)[0] = x0; } @@ -259,7 +266,7 @@ struct u64 : public uint2 __forceinline__ __device__ void print(int i) const { - if(i<2) + if(i < 2) printf("gpu: %lu\n", ((uint64_t*)&this->x)[0]); } }; @@ -269,42 +276,42 @@ struct u64 : public uint2 * @tparam MEM_MODE if `0` than 64bit memory transfers per thread will be used to store/load data within shared memory * else if `1` 256bit operations will be used */ -template +template #ifdef XMR_STAK_THREADS -__launch_bounds__( XMR_STAK_THREADS * 2 ) +__launch_bounds__(XMR_STAK_THREADS * 2) #endif -__global__ void cryptonight_core_gpu_phase2_double( - const uint32_t ITERATIONS, const size_t MEMORY, const uint32_t MASK, - int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state, - uint32_t startNonce, uint32_t * __restrict__ d_input ) + __global__ void cryptonight_core_gpu_phase2_double( + const uint32_t ITERATIONS, const size_t MEMORY, const uint32_t MASK, + int threads, int bfactor, int partidx, uint32_t* d_long_state, uint32_t* d_ctx_a, uint32_t* d_ctx_b, uint32_t* d_ctx_state, + uint32_t startNonce, uint32_t* __restrict__ d_input) { __shared__ uint32_t sharedMemory[512]; - cn_aes_gpu_init_half( sharedMemory ); + cn_aes_gpu_init_half(sharedMemory); -#if( __CUDA_ARCH__ < 300 ) +#if(__CUDA_ARCH__ < 300) extern __shared__ uint64_t externShared[]; // 8 x 64bit values volatile uint64_t* myChunks = (volatile uint64_t*)(externShared + (threadIdx.x >> 1) * 8); - volatile uint32_t* sPtr = (volatile uint32_t*)(externShared + (blockDim.x >> 1) * 8) + (threadIdx.x & 0xFFFFFFFE); + volatile uint32_t* sPtr = (volatile uint32_t*)(externShared + (blockDim.x >> 1) * 8) + (threadIdx.x & 0xFFFFFFFE); #else extern __shared__ uint64_t chunkMem[]; - volatile uint32_t* sPtr = NULL; + volatile uint32_t* sPtr = NULL; // 8 x 64bit values volatile uint64_t* myChunks = (volatile uint64_t*)(chunkMem + (threadIdx.x >> 1) * 8); #endif - __syncthreads( ); + __syncthreads(); const uint64_t tid = (blockDim.x * blockIdx.x + threadIdx.x); const uint32_t thread = tid >> 1; const uint32_t sub = tid & 1; - if ( thread >= threads ) + if(thread >= threads) return; - uint8_t *l0 = (uint8_t*)&d_long_state[(IndexType) thread * MEMORY]; + uint8_t* l0 = (uint8_t*)&d_long_state[(IndexType)thread * MEMORY]; uint64_t ax0 = ((uint64_t*)(d_ctx_a + thread * 4))[sub]; uint64_t bx0; @@ -324,22 +331,22 @@ __global__ void cryptonight_core_gpu_phase2_double( sqrt_result = (d_ctx_b + thread * 16 + 4 * 2 + 2)[0]; } else - bx0 = ((uint64_t*)(d_ctx_b + thread * 4))[sub]; + bx0 = ((uint64_t*)(d_ctx_b + thread * 4))[sub]; - const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor ); + const int batchsize = (ITERATIONS * 2) >> (1 + bfactor); const int start = partidx * batchsize; const int end = start + batchsize; for(int i = start; i < end; ++i) { - ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0]; + ptr0 = (uint64_t*)&l0[idx0 & MASK & 0x1FFFC0]; if(MEM_MODE == 0) { - #pragma unroll 4 +#pragma unroll 4 for(int x = 0; x < 8; x += 2) { - myChunks[x + sub] = ptr0[ x + sub ]; + myChunks[x + sub] = ptr0[x + sub]; } } else @@ -347,52 +354,51 @@ __global__ void cryptonight_core_gpu_phase2_double( uint32_t idx1 = (idx0 & 0x30) >> 3; - const u64 cx = myChunks[ idx1 + sub ]; - const u64 cx2 = myChunks[ idx1 + ((sub + 1) & 1) ]; + const u64 cx = myChunks[idx1 + sub]; + const u64 cx2 = myChunks[idx1 + ((sub + 1) & 1)]; u64 cx_aes = ax0 ^ u64( - t_fn0( cx.x & 0xff ) ^ t_fn1( (cx.y >> 8) & 0xff ) ^ rotate16(t_fn0( (cx2.x >> 16) & 0xff ) ^ t_fn1( (cx2.y >> 24 ) )), - t_fn0( cx.y & 0xff ) ^ t_fn1( (cx2.x >> 8) & 0xff ) ^ rotate16(t_fn0( (cx2.y >> 16) & 0xff ) ^ t_fn1( (cx.x >> 24 ) )) - ); + t_fn0(cx.x & 0xff) ^ t_fn1((cx.y >> 8) & 0xff) ^ rotate16(t_fn0((cx2.x >> 16) & 0xff) ^ t_fn1((cx2.y >> 24))), + t_fn0(cx.y & 0xff) ^ t_fn1((cx2.x >> 8) & 0xff) ^ rotate16(t_fn0((cx2.y >> 16) & 0xff) ^ t_fn1((cx.x >> 24)))); if(ALGO == cryptonight_monero_v8) { - const uint64_t chunk1 = myChunks[ idx1 ^ 2 + sub ]; - const uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ]; - const uint64_t chunk3 = myChunks[ idx1 ^ 6 + sub ]; -#if (__CUDACC_VER_MAJOR__ >= 9) + const uint64_t chunk1 = myChunks[idx1 ^ 2 + sub]; + const uint64_t chunk2 = myChunks[idx1 ^ 4 + sub]; + const uint64_t chunk3 = myChunks[idx1 ^ 6 + sub]; +#if(__CUDACC_VER_MAJOR__ >= 9) __syncwarp(); #else - __syncthreads( ); + __syncthreads(); #endif - myChunks[ idx1 ^ 2 + sub ] = chunk3 + bx1; - myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0; - myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0; + myChunks[idx1 ^ 2 + sub] = chunk3 + bx1; + myChunks[idx1 ^ 4 + sub] = chunk1 + bx0; + myChunks[idx1 ^ 6 + sub] = chunk2 + ax0; } else if(ALGO == cryptonight_v8_reversewaltz) { - const uint64_t chunk3 = myChunks[ idx1 ^ 2 + sub ]; - const uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ]; - const uint64_t chunk1 = myChunks[ idx1 ^ 6 + sub ]; -#if (__CUDACC_VER_MAJOR__ >= 9) + const uint64_t chunk3 = myChunks[idx1 ^ 2 + sub]; + const uint64_t chunk2 = myChunks[idx1 ^ 4 + sub]; + const uint64_t chunk1 = myChunks[idx1 ^ 6 + sub]; +#if(__CUDACC_VER_MAJOR__ >= 9) __syncwarp(); #else - __syncthreads( ); + __syncthreads(); #endif - myChunks[ idx1 ^ 2 + sub ] = chunk3 + bx1; - myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0; - myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0; + myChunks[idx1 ^ 2 + sub] = chunk3 + bx1; + myChunks[idx1 ^ 4 + sub] = chunk1 + bx0; + myChunks[idx1 ^ 6 + sub] = chunk2 + ax0; } - myChunks[ idx1 + sub ] = cx_aes ^ bx0; + myChunks[idx1 + sub] = cx_aes ^ bx0; if(MEM_MODE == 0) { - #pragma unroll 4 +#pragma unroll 4 for(int x = 0; x < 8; x += 2) { - ptr0[ x + sub ] = myChunks[x + sub]; + ptr0[x + sub] = myChunks[x + sub]; } } else @@ -400,14 +406,14 @@ __global__ void cryptonight_core_gpu_phase2_double( idx0 = shuffle<2>(sPtr, sub, cx_aes.x, 0); idx1 = (idx0 & 0x30) >> 3; - ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0]; + ptr0 = (uint64_t*)&l0[idx0 & MASK & 0x1FFFC0]; if(MEM_MODE == 0) { - #pragma unroll 4 +#pragma unroll 4 for(int x = 0; x < 8; x += 2) { - myChunks[x + sub] = ptr0[ x + sub ]; + myChunks[x + sub] = ptr0[x + sub]; } } else @@ -417,15 +423,15 @@ __global__ void cryptonight_core_gpu_phase2_double( bx0 = cx_aes; uint64_t cx_mul; - ((uint32_t*)&cx_mul)[0] = shuffle<2>(sPtr, sub, cx_aes.x , 0); - ((uint32_t*)&cx_mul)[1] = shuffle<2>(sPtr, sub, cx_aes.y , 0); + ((uint32_t*)&cx_mul)[0] = shuffle<2>(sPtr, sub, cx_aes.x, 0); + ((uint32_t*)&cx_mul)[1] = shuffle<2>(sPtr, sub, cx_aes.y, 0); if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) && sub == 1) { // Use division and square root results from the _previous_ iteration to hide the latency ((uint32_t*)&division_result)[1] ^= sqrt_result; - ((uint64_t*)myChunks)[ idx1 ] ^= division_result; + ((uint64_t*)myChunks)[idx1] ^= division_result; const uint32_t dd = (static_cast(cx_mul) + (sqrt_result << 1)) | 0x80000001UL; division_result = fast_div_v2(cx_aes, dd); @@ -433,46 +439,46 @@ __global__ void cryptonight_core_gpu_phase2_double( // Use division_result as an input for the square root to prevent parallel implementation in hardware sqrt_result = fast_sqrt_v2(cx_mul + division_result); } -#if (__CUDACC_VER_MAJOR__ >= 9) - __syncwarp(); +#if(__CUDACC_VER_MAJOR__ >= 9) + __syncwarp(); #else - __syncthreads( ); + __syncthreads(); #endif - uint64_t c = ((uint64_t*)myChunks)[ idx1 + sub ]; + uint64_t c = ((uint64_t*)myChunks)[idx1 + sub]; { - uint64_t cl = ((uint64_t*)myChunks)[ idx1 ]; + uint64_t cl = ((uint64_t*)myChunks)[idx1]; // sub 0 -> hi, sub 1 -> lo - uint64_t res = sub == 0 ? __umul64hi( cx_mul, cl ) : cx_mul * cl; + uint64_t res = sub == 0 ? __umul64hi(cx_mul, cl) : cx_mul * cl; if(ALGO == cryptonight_monero_v8) { - const uint64_t chunk1 = myChunks[ idx1 ^ 2 + sub ] ^ res; - uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ]; + const uint64_t chunk1 = myChunks[idx1 ^ 2 + sub] ^ res; + uint64_t chunk2 = myChunks[idx1 ^ 4 + sub]; res ^= ((uint64_t*)&chunk2)[0]; - const uint64_t chunk3 = myChunks[ idx1 ^ 6 + sub ]; -#if (__CUDACC_VER_MAJOR__ >= 9) + const uint64_t chunk3 = myChunks[idx1 ^ 6 + sub]; +#if(__CUDACC_VER_MAJOR__ >= 9) __syncwarp(); #else - __syncthreads( ); + __syncthreads(); #endif - myChunks[ idx1 ^ 2 + sub ] = chunk3 + bx1; - myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0; - myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0; + myChunks[idx1 ^ 2 + sub] = chunk3 + bx1; + myChunks[idx1 ^ 4 + sub] = chunk1 + bx0; + myChunks[idx1 ^ 6 + sub] = chunk2 + ax0; } if(ALGO == cryptonight_v8_reversewaltz) { - const uint64_t chunk3 = myChunks[ idx1 ^ 2 + sub ] ^ res; - uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ]; + const uint64_t chunk3 = myChunks[idx1 ^ 2 + sub] ^ res; + uint64_t chunk2 = myChunks[idx1 ^ 4 + sub]; res ^= ((uint64_t*)&chunk2)[0]; - const uint64_t chunk1 = myChunks[ idx1 ^ 6 + sub ]; -#if (__CUDACC_VER_MAJOR__ >= 9) + const uint64_t chunk1 = myChunks[idx1 ^ 6 + sub]; +#if(__CUDACC_VER_MAJOR__ >= 9) __syncwarp(); #else - __syncthreads( ); + __syncthreads(); #endif - myChunks[ idx1 ^ 2 + sub ] = chunk3 + bx1; - myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0; - myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0; + myChunks[idx1 ^ 2 + sub] = chunk3 + bx1; + myChunks[idx1 ^ 4 + sub] = chunk1 + bx0; + myChunks[idx1 ^ 6 + sub] = chunk2 + ax0; } ax0 += res; } @@ -481,13 +487,13 @@ __global__ void cryptonight_core_gpu_phase2_double( bx1 = bx0; bx0 = cx_aes; } - myChunks[ idx1 + sub ] = ax0; + myChunks[idx1 + sub] = ax0; if(MEM_MODE == 0) { - #pragma unroll 4 +#pragma unroll 4 for(int x = 0; x < 8; x += 2) { - ptr0[ x + sub ] = myChunks[x + sub]; + ptr0[x + sub] = myChunks[x + sub]; } } else @@ -496,7 +502,7 @@ __global__ void cryptonight_core_gpu_phase2_double( idx0 = shuffle<2>(sPtr, sub, static_cast(ax0), 0); } - if ( bfactor > 0 ) + if(bfactor > 0) { ((uint64_t*)(d_ctx_a + thread * 4))[sub] = ax0; if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) @@ -516,41 +522,41 @@ __global__ void cryptonight_core_gpu_phase2_double( } } -template +template #ifdef XMR_STAK_THREADS -__launch_bounds__( XMR_STAK_THREADS * 4 ) +__launch_bounds__(XMR_STAK_THREADS * 4) #endif -__global__ void cryptonight_core_gpu_phase2_quad( - const uint32_t ITERATIONS, const size_t MEMORY, const uint32_t MASK, - int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state, - uint32_t startNonce, uint32_t * __restrict__ d_input ) + __global__ void cryptonight_core_gpu_phase2_quad( + const uint32_t ITERATIONS, const size_t MEMORY, const uint32_t MASK, + int threads, int bfactor, int partidx, uint32_t* d_long_state, uint32_t* d_ctx_a, uint32_t* d_ctx_b, uint32_t* d_ctx_state, + uint32_t startNonce, uint32_t* __restrict__ d_input) { __shared__ uint32_t sharedMemory[1024]; - cn_aes_gpu_init( sharedMemory ); + cn_aes_gpu_init(sharedMemory); - __syncthreads( ); + __syncthreads(); - const int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 2; + const int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2; const uint32_t nonce = startNonce + thread; const int sub = threadIdx.x & 3; const int sub2 = sub & 2; -#if( __CUDA_ARCH__ < 300 ) - extern __shared__ uint32_t shuffleMem[]; - volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x& 0xFFFFFFFC)); +#if(__CUDA_ARCH__ < 300) + extern __shared__ uint32_t shuffleMem[]; + volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x & 0xFFFFFFFC)); #else - volatile uint32_t* sPtr = NULL; + volatile uint32_t* sPtr = NULL; #endif - if ( thread >= threads ) + if(thread >= threads) return; int i, k; uint32_t j; - const int batchsize = (ITERATIONS * 2) >> ( 2 + bfactor ); + const int batchsize = (ITERATIONS * 2) >> (2 + bfactor); const int start = partidx * batchsize; const int end = start + batchsize; - uint32_t * long_state = &d_long_state[(IndexType) thread * MEMORY]; + uint32_t* long_state = &d_long_state[(IndexType)thread * MEMORY]; uint32_t a, d[2], idx0; uint32_t t1[2], t2[2], res; @@ -564,9 +570,9 @@ __global__ void cryptonight_core_gpu_phase2_quad( } uint32_t tweak1_2[2]; - if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) + if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) { - uint32_t * state = d_ctx_state + thread * 50; + uint32_t* state = d_ctx_state + thread * 50; tweak1_2[0] = (d_input[8] >> 24) | (d_input[9] << 8); tweak1_2[0] ^= state[48]; tweak1_2[1] = nonce; @@ -574,7 +580,7 @@ __global__ void cryptonight_core_gpu_phase2_quad( } a = (d_ctx_a + thread * 4)[sub]; - idx0 = shuffle<4>(sPtr,sub, a, 0); + idx0 = shuffle<4>(sPtr, sub, a, 0); if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) { if(partidx != 0) @@ -585,33 +591,33 @@ __global__ void cryptonight_core_gpu_phase2_quad( } d[1] = (d_ctx_b + thread * 4)[sub]; - #pragma unroll 2 - for ( i = start; i < end; ++i ) +#pragma unroll 2 + for(i = start; i < end; ++i) { - #pragma unroll 2 - for ( int x = 0; x < 2; ++x ) +#pragma unroll 2 + for(int x = 0; x < 2; ++x) { - j = ( ( idx0 & MASK ) >> 2 ) + sub; + j = ((idx0 & MASK) >> 2) + sub; if(ALGO == cryptonight_bittube2) { uint32_t k[4]; - k[0] = ~loadGlobal32( long_state + j ); - k[1] = shuffle<4>(sPtr,sub, k[0], sub + 1); - k[2] = shuffle<4>(sPtr,sub, k[0], sub + 2); - k[3] = shuffle<4>(sPtr,sub, k[0], sub + 3); + k[0] = ~loadGlobal32(long_state + j); + k[1] = shuffle<4>(sPtr, sub, k[0], sub + 1); + k[2] = shuffle<4>(sPtr, sub, k[0], sub + 2); + k[3] = shuffle<4>(sPtr, sub, k[0], sub + 3); - #pragma unroll 4 +#pragma unroll 4 for(int i = 0; i < 4; ++i) { // only calculate the key if all data are up to date if(i == sub) { d[x] = a ^ - t_fn0( k[0] & 0xff ) ^ - t_fn1( (k[1] >> 8) & 0xff ) ^ - t_fn2( (k[2] >> 16) & 0xff ) ^ - t_fn3( (k[3] >> 24 ) ); + t_fn0(k[0] & 0xff) ^ + t_fn1((k[1] >> 8) & 0xff) ^ + t_fn2((k[2] >> 16) & 0xff) ^ + t_fn3((k[3] >> 24)); } // the last shuffle is not needed if(i != 3) @@ -619,13 +625,13 @@ __global__ void cryptonight_core_gpu_phase2_quad( /* avoid negative number for modulo * load valid key (k) depending on the round */ - k[(4 - sub + i)%4] = shuffle<4>(sPtr,sub, k[0] ^ d[x], i); + k[(4 - sub + i) % 4] = shuffle<4>(sPtr, sub, k[0] ^ d[x], i); } } } else { - uint32_t x_0 = loadGlobal32( long_state + j ); + uint32_t x_0 = loadGlobal32(long_state + j); if(ALGO == cryptonight_conceal) { @@ -642,18 +648,18 @@ __global__ void cryptonight_core_gpu_phase2_quad( x_0 = (uint32_t)(((int32_t)x_0) ^ ((int32_t)c_old)); } - const uint32_t x_1 = shuffle<4>(sPtr,sub, x_0, sub + 1); - const uint32_t x_2 = shuffle<4>(sPtr,sub, x_0, sub + 2); - const uint32_t x_3 = shuffle<4>(sPtr,sub, x_0, sub + 3); + const uint32_t x_1 = shuffle<4>(sPtr, sub, x_0, sub + 1); + const uint32_t x_2 = shuffle<4>(sPtr, sub, x_0, sub + 2); + const uint32_t x_3 = shuffle<4>(sPtr, sub, x_0, sub + 3); d[x] = a ^ - t_fn0( x_0 & 0xff ) ^ - t_fn1( (x_1 >> 8) & 0xff ) ^ - t_fn2( (x_2 >> 16) & 0xff ) ^ - t_fn3( ( x_3 >> 24 ) ); + t_fn0(x_0 & 0xff) ^ + t_fn1((x_1 >> 8) & 0xff) ^ + t_fn2((x_2 >> 16) & 0xff) ^ + t_fn3((x_3 >> 24)); } //XOR_BLOCKS_DST(c, b, &long_state[j]); - t1[0] = shuffle<4>(sPtr,sub, d[x], 0); + t1[0] = shuffle<4>(sPtr, sub, d[x], 0); const uint32_t z = d[0] ^ d[1]; if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) @@ -663,157 +669,157 @@ __global__ void cryptonight_core_gpu_phase2_quad( { const uint32_t index = ((z >> 26) & 12) | ((z >> 23) & 2); const uint32_t fork_7 = z ^ ((table >> index) & 0x30U) << 24; - storeGlobal32( long_state + j, sub == 2 ? fork_7 : z ); + storeGlobal32(long_state + j, sub == 2 ? fork_7 : z); } else if(ALGO == cryptonight_stellite) { const uint32_t index = ((z >> 27) & 12) | ((z >> 23) & 2); const uint32_t fork_7 = z ^ ((table >> index) & 0x30U) << 24; - storeGlobal32( long_state + j, sub == 2 ? fork_7 : z ); + storeGlobal32(long_state + j, sub == 2 ? fork_7 : z); } } else - storeGlobal32( long_state + j, z ); + storeGlobal32(long_state + j, z); //MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & MASK]); - j = ( ( *t1 & MASK ) >> 2 ) + sub; + j = ((*t1 & MASK) >> 2) + sub; uint32_t yy[2]; - *( (uint64_t*) yy ) = loadGlobal64( ( (uint64_t *) long_state )+( j >> 1 ) ); + *((uint64_t*)yy) = loadGlobal64(((uint64_t*)long_state) + (j >> 1)); uint32_t zz[2]; - zz[0] = shuffle<4>(sPtr,sub, yy[0], 0); - zz[1] = shuffle<4>(sPtr,sub, yy[1], 0); + zz[0] = shuffle<4>(sPtr, sub, yy[0], 0); + zz[1] = shuffle<4>(sPtr, sub, yy[1], 0); - t1[1] = shuffle<4>(sPtr,sub, d[x], 1); - #pragma unroll - for ( k = 0; k < 2; k++ ) - t2[k] = shuffle<4>(sPtr,sub, a, k + sub2); + t1[1] = shuffle<4>(sPtr, sub, d[x], 1); +#pragma unroll + for(k = 0; k < 2; k++) + t2[k] = shuffle<4>(sPtr, sub, a, k + sub2); - *( (uint64_t *) t2 ) += sub2 ? ( *( (uint64_t *) t1 ) * *( (uint64_t*) zz ) ) : __umul64hi( *( (uint64_t *) t1 ), *( (uint64_t*) zz ) ); + *((uint64_t*)t2) += sub2 ? (*((uint64_t*)t1) * *((uint64_t*)zz)) : __umul64hi(*((uint64_t*)t1), *((uint64_t*)zz)); - res = *( (uint64_t *) t2 ) >> ( sub & 1 ? 32 : 0 ); + res = *((uint64_t*)t2) >> (sub & 1 ? 32 : 0); if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) { const uint32_t tweaked_res = tweak1_2[sub & 1] ^ res; uint32_t long_state_update = sub2 ? tweaked_res : res; - if (ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) + if(ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) { - uint32_t value = shuffle<4>(sPtr,sub, long_state_update, sub & 1) ^ long_state_update; + uint32_t value = shuffle<4>(sPtr, sub, long_state_update, sub & 1) ^ long_state_update; long_state_update = sub >= 2 ? value : long_state_update; } - storeGlobal32( long_state + j, long_state_update ); + storeGlobal32(long_state + j, long_state_update); } else - storeGlobal32( long_state + j, res ); + storeGlobal32(long_state + j, res); - a = ( sub & 1 ? yy[1] : yy[0] ) ^ res; - idx0 = shuffle<4>(sPtr,sub, a, 0); + a = (sub & 1 ? yy[1] : yy[0]) ^ res; + idx0 = shuffle<4>(sPtr, sub, a, 0); if(ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) { - int64_t n = loadGlobal64( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3)); - int32_t d = loadGlobal32( (uint32_t*)(( (uint64_t *) long_state ) + (( idx0 & MASK) >> 3) + 1u )); + int64_t n = loadGlobal64(((uint64_t*)long_state) + ((idx0 & MASK) >> 3)); + int32_t d = loadGlobal32((uint32_t*)(((uint64_t*)long_state) + ((idx0 & MASK) >> 3) + 1u)); int64_t q = fast_div_heavy(n, (d | 0x5)); - if(sub&1) - storeGlobal64( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3), n ^ q ); + if(sub & 1) + storeGlobal64(((uint64_t*)long_state) + ((idx0 & MASK) >> 3), n ^ q); idx0 = d ^ q; } else if(ALGO == cryptonight_haven || ALGO == cryptonight_superfast) { - int64_t n = loadGlobal64( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3)); - int32_t d = loadGlobal32( (uint32_t*)(( (uint64_t *) long_state ) + (( idx0 & MASK) >> 3) + 1u )); + int64_t n = loadGlobal64(((uint64_t*)long_state) + ((idx0 & MASK) >> 3)); + int32_t d = loadGlobal32((uint32_t*)(((uint64_t*)long_state) + ((idx0 & MASK) >> 3) + 1u)); int64_t q = fast_div_heavy(n, (d | 0x5)); - if(sub&1) - storeGlobal64( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3), n ^ q ); + if(sub & 1) + storeGlobal64(((uint64_t*)long_state) + ((idx0 & MASK) >> 3), n ^ q); idx0 = (~d) ^ q; } } } - if ( bfactor > 0 ) + if(bfactor > 0) { (d_ctx_a + thread * 4)[sub] = a; (d_ctx_b + thread * 4)[sub] = d[1]; if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) - if(sub&1) + if(sub & 1) *(d_ctx_b + threads * 4 + thread) = idx0; if(ALGO == cryptonight_conceal) *(d_ctx_b + threads * 4 + thread * 4 + sub) = float_as_int(conc_var); } } -template +template __global__ void cryptonight_core_gpu_phase3( - const uint32_t ITERATIONS, const size_t MEMORY, - int threads, int bfactor, int partidx, const uint32_t * __restrict__ long_state, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_key2 ) + const uint32_t ITERATIONS, const size_t MEMORY, + int threads, int bfactor, int partidx, const uint32_t* __restrict__ long_state, uint32_t* __restrict__ d_ctx_state, uint32_t* __restrict__ d_ctx_key2) { __shared__ uint32_t sharedMemory[1024]; - cn_aes_gpu_init( sharedMemory ); - __syncthreads( ); + cn_aes_gpu_init(sharedMemory); + __syncthreads(); - int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 3; - int subv = ( threadIdx.x & 7 ); + int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3; + int subv = (threadIdx.x & 7); int sub = subv << 2; const int batchsize = MEMORY >> bfactor; const int start = (partidx % (1 << bfactor)) * batchsize; const int end = start + batchsize; - if ( thread >= threads ) + if(thread >= threads) return; uint32_t key[40], text[4]; - MEMCPY8( key, d_ctx_key2 + thread * 40, 20 ); - MEMCPY8( text, d_ctx_state + thread * 50 + sub + 16, 2 ); + MEMCPY8(key, d_ctx_key2 + thread * 40, 20); + MEMCPY8(text, d_ctx_state + thread * 50 + sub + 16, 2); - __syncthreads( ); + __syncthreads(); -#if( __CUDA_ARCH__ < 300 ) +#if(__CUDA_ARCH__ < 300) extern __shared__ uint32_t shuffleMem[]; - volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x& 0xFFFFFFF8)); + volatile uint32_t* sPtr = (volatile uint32_t*)(shuffleMem + (threadIdx.x & 0xFFFFFFF8)); #else volatile uint32_t* sPtr = NULL; #endif - for ( int i = start; i < end; i += 32 ) + for(int i = start; i < end; i += 32) { - #pragma unroll - for ( int j = 0; j < 4; ++j ) - text[j] ^= long_state[((IndexType) thread * MEMORY) + ( sub + i + j)]; +#pragma unroll + for(int j = 0; j < 4; ++j) + text[j] ^= long_state[((IndexType)thread * MEMORY) + (sub + i + j)]; - cn_aes_pseudo_round_mut( sharedMemory, text, key ); + cn_aes_pseudo_round_mut(sharedMemory, text, key); if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) { - #pragma unroll - for ( int j = 0; j < 4; ++j ) - text[j] ^= shuffle<8>(sPtr, subv, text[j], (subv+1)&7); +#pragma unroll + for(int j = 0; j < 4; ++j) + text[j] ^= shuffle<8>(sPtr, subv, text[j], (subv + 1) & 7); } } - MEMCPY8( d_ctx_state + thread * 50 + sub + 16, text, 2 ); + MEMCPY8(d_ctx_state + thread * 50 + sub + 16, text, 2); } -template +template void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo& algo) { uint32_t MASK = algo.Mask(); uint32_t ITERATIONS = algo.Iter(); - size_t MEM = algo.Mem()/4; + size_t MEM = algo.Mem() / 4; - dim3 grid( ctx->device_blocks ); - dim3 block( ctx->device_threads ); - dim3 block2( ctx->device_threads << 1 ); - dim3 block4( ctx->device_threads << 2 ); - dim3 block8( ctx->device_threads << 3 ); + dim3 grid(ctx->device_blocks); + dim3 block(ctx->device_threads); + dim3 block2(ctx->device_threads << 1); + dim3 block4(ctx->device_threads << 2); + dim3 block8(ctx->device_threads << 3); int partcount = 1 << ctx->device_bfactor; @@ -823,27 +829,29 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo * kernel splitting if the user defined a `bfactor >= 5` */ int bfactorOneThree = ctx->device_bfactor - 4; - if( bfactorOneThree < 0 ) + if(bfactorOneThree < 0) bfactorOneThree = 0; int partcountOneThree = 1 << bfactorOneThree; - for ( int i = 0; i < partcountOneThree; i++ ) + for(int i = 0; i < partcountOneThree; i++) { - CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase1<<< grid, block8 >>>( - ITERATIONS, - MEM, - ctx->device_blocks*ctx->device_threads, - bfactorOneThree, i, - ctx->d_long_state, - (ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast ? ctx->d_ctx_state2 : ctx->d_ctx_state), - ctx->d_ctx_key1 )); - - if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep ); + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase1<<>>( + ITERATIONS, + MEM, + ctx->device_blocks * ctx->device_threads, + bfactorOneThree, i, + ctx->d_long_state, + (ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast ? ctx->d_ctx_state2 : ctx->d_ctx_state), + ctx->d_ctx_key1)); + + if(partcount > 1 && ctx->device_bsleep > 0) + compat_usleep(ctx->device_bsleep); } - if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep ); + if(partcount > 1 && ctx->device_bsleep > 0) + compat_usleep(ctx->device_bsleep); - for ( int i = 0; i < partcount; i++ ) + for(int i = 0; i < partcount; i++) { if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) { @@ -856,12 +864,11 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo block2, sizeof(uint64_t) * block.x * 8 + // shuffle memory for fermi gpus - block2.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ) - >>>( + block2.x * sizeof(uint32_t) * static_cast(ctx->device_arch[0] < 3)>>>( ITERATIONS, MEM, MASK, - ctx->device_blocks*ctx->device_threads, + ctx->device_blocks * ctx->device_threads, ctx->device_bfactor, i, ctx->d_long_state, @@ -869,28 +876,24 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo ctx->d_ctx_b, ctx->d_ctx_state, nonce, - ctx->d_input - ) - ); + ctx->d_input)); } else if(ALGO == cryptonight_r_wow || ALGO == cryptonight_r) { - int numThreads = ctx->device_blocks*ctx->device_threads; + int numThreads = ctx->device_blocks * ctx->device_threads; void* args[] = { &ITERATIONS, &MEM, &MASK, &numThreads, &ctx->device_bfactor, &i, - &ctx->d_long_state, &ctx->d_ctx_a, &ctx->d_ctx_b, &ctx->d_ctx_state, &nonce, &ctx->d_input - }; + &ctx->d_long_state, &ctx->d_ctx_a, &ctx->d_ctx_b, &ctx->d_ctx_state, &nonce, &ctx->d_input}; CU_CHECK(ctx->device_id, cuLaunchKernel( - ctx->kernel, - grid.x, grid.y, grid.z, - block2.x, block2.y, block2.z, - sizeof(uint64_t) * block.x * 8 + - // shuffle memory for fermi gpus - block2.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ), - nullptr, - args, 0 - )); + ctx->kernel, + grid.x, grid.y, grid.z, + block2.x, block2.y, block2.z, + sizeof(uint64_t) * block.x * 8 + + // shuffle memory for fermi gpus + block2.x * sizeof(uint32_t) * static_cast(ctx->device_arch[0] < 3), + nullptr, + args, 0)); CU_CHECK(ctx->device_id, cuCtxSynchronize()); } else @@ -901,12 +904,11 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo cryptonight_core_gpu_phase2_quad<<< grid, block4, - block4.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ) - >>>( + block4.x * sizeof(uint32_t) * static_cast(ctx->device_arch[0] < 3)>>>( ITERATIONS, MEM, MASK, - ctx->device_blocks*ctx->device_threads, + ctx->device_blocks * ctx->device_threads, ctx->device_bfactor, i, ctx->d_long_state, @@ -914,57 +916,54 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo ctx->d_ctx_b, ctx->d_ctx_state, nonce, - ctx->d_input - ) - ); + ctx->d_input)); } - if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep ); + if(partcount > 1 && ctx->device_bsleep > 0) + compat_usleep(ctx->device_bsleep); } int roundsPhase3 = partcountOneThree; - if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven|| ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast ) + if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) { // cryptonight_heavy used two full rounds over the scratchpad memory roundsPhase3 *= 2; } - for ( int i = 0; i < roundsPhase3; i++ ) + for(int i = 0; i < roundsPhase3; i++) { CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<<< - grid, - block8, - block8.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ) - >>>( - ITERATIONS, - MEM, - ctx->device_blocks*ctx->device_threads, - bfactorOneThree, i, - ctx->d_long_state, - ctx->d_ctx_state, ctx->d_ctx_key2 )); + grid, + block8, + block8.x * sizeof(uint32_t) * static_cast(ctx->device_arch[0] < 3)>>>( + ITERATIONS, + MEM, + ctx->device_blocks * ctx->device_threads, + bfactorOneThree, i, + ctx->d_long_state, + ctx->d_ctx_state, ctx->d_ctx_key2)); } } -template +template void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo& algo) { const uint32_t MASK = algo.Mask(); const uint32_t ITERATIONS = algo.Iter(); const size_t MEM = algo.Mem(); - dim3 grid( ctx->device_blocks ); - dim3 block( ctx->device_threads ); - dim3 block2( ctx->device_threads << 1 ); - dim3 block4( ctx->device_threads << 2 ); - dim3 block8( ctx->device_threads << 3 ); + dim3 grid(ctx->device_blocks); + dim3 block(ctx->device_threads); + dim3 block2(ctx->device_threads << 1); + dim3 block4(ctx->device_threads << 2); + dim3 block8(ctx->device_threads << 3); size_t intensity = ctx->device_blocks * ctx->device_threads; CUDA_CHECK_KERNEL( ctx->device_id, - xmrstak::nvidia::cn_explode_gpu<<>>(MEM, (int*)ctx->d_ctx_state, (int*)ctx->d_long_state) - ); + xmrstak::nvidia::cn_explode_gpu<<>>(MEM, (int*)ctx->d_ctx_state, (int*)ctx->d_long_state)); int partcount = 1 << ctx->device_bfactor; for(int i = 0; i < partcount; i++) @@ -972,20 +971,16 @@ void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce, const xmrstak_ CUDA_CHECK_KERNEL( ctx->device_id, // 36 x 16byte x numThreads - xmrstak::nvidia::cryptonight_core_gpu_phase2_gpu - <<device_blocks, ctx->device_threads * 16, 32 * 16 * ctx->device_threads>>> - ( - ITERATIONS, - MEM, - MASK, - (int*)ctx->d_ctx_state, - (int*)ctx->d_long_state, - ctx->device_bfactor, - i, - ctx->d_ctx_a, - ctx->d_ctx_b - ) - ); + xmrstak::nvidia::cryptonight_core_gpu_phase2_gpu<<device_blocks, ctx->device_threads * 16, 32 * 16 * ctx->device_threads>>>( + ITERATIONS, + MEM, + MASK, + (int*)ctx->d_ctx_state, + (int*)ctx->d_long_state, + ctx->device_bfactor, + i, + ctx->d_ctx_a, + ctx->d_ctx_b)); } /* bfactor for phase 3 @@ -994,32 +989,31 @@ void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce, const xmrstak_ * kernel splitting if the user defined a `bfactor >= 5` */ int bfactorOneThree = ctx->device_bfactor - 4; - if( bfactorOneThree < 0 ) + if(bfactorOneThree < 0) bfactorOneThree = 0; int partcountOneThree = 1 << bfactorOneThree; int roundsPhase3 = partcountOneThree; if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || - ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast ) + ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) { // cryptonight_heavy used two full rounds over the scratchpad memory roundsPhase3 *= 2; } - for ( int i = 0; i < roundsPhase3; i++ ) + for(int i = 0; i < roundsPhase3; i++) { CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<<< - grid, - block8, - block8.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ) - >>>( - ITERATIONS, - MEM/4, - ctx->device_blocks*ctx->device_threads, - bfactorOneThree, i, - ctx->d_long_state, - ctx->d_ctx_state, ctx->d_ctx_key2 )); + grid, + block8, + block8.x * sizeof(uint32_t) * static_cast(ctx->device_arch[0] < 3)>>>( + ITERATIONS, + MEM / 4, + ctx->device_blocks * ctx->device_threads, + bfactorOneThree, i, + ctx->d_long_state, + ctx->d_ctx_state, ctx->d_ctx_key2)); } } @@ -1030,7 +1024,7 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, const xmrstak_algo& miner_algo, ui { if(ctx->kernel_height != chain_height || ctx->cached_algo != miner_algo) { - if(ctx->module) + if(ctx->module) cuModuleUnload(ctx->module); uint32_t PRECOMPILATION_DEPTH = 4; @@ -1045,15 +1039,16 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, const xmrstak_algo& miner_algo, ui ctx->kernel_height = chain_height; ctx->cached_algo = miner_algo; - for (int i = 1; i <= PRECOMPILATION_DEPTH; ++i) + for(int i = 1; i <= PRECOMPILATION_DEPTH; ++i) xmrstak::nvidia::CryptonightR_get_program(ptx, lowered_name, miner_algo, chain_height + i, PRECOMPILATION_DEPTH, ctx->device_arch[0], ctx->device_arch[1], true); } } - typedef void (*cuda_hash_fn)(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo& algo); + typedef void (*cuda_hash_fn)(nvid_ctx * ctx, uint32_t nonce, const xmrstak_algo& algo); - if(miner_algo == invalid_algo) return; + if(miner_algo == invalid_algo) + return; static const cuda_hash_fn func_table[] = { cryptonight_core_gpu_hash, @@ -1105,13 +1100,11 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, const xmrstak_algo& miner_algo, ui cryptonight_core_gpu_hash, cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash - }; + cryptonight_core_gpu_hash}; std::bitset<1> digit; digit.set(0, ctx->memMode == 1); - cuda_hash_fn selected_function = func_table[ ((miner_algo - 1u) << 1) | digit.to_ulong() ]; + cuda_hash_fn selected_function = func_table[((miner_algo - 1u) << 1) | digit.to_ulong()]; selected_function(ctx, startNonce, miner_algo); - } diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp index fee7e13d1..a66804ecf 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp @@ -1,11 +1,11 @@ #pragma once +#include #include #include -#include -#include "cuda_keccak.hpp" #include "cuda_extra.hpp" +#include "cuda_keccak.hpp" namespace xmrstak { @@ -15,7 +15,7 @@ namespace nvidia struct __m128i : public int4 { - __forceinline__ __device__ __m128i(){} + __forceinline__ __device__ __m128i() {} __forceinline__ __device__ __m128i( const uint32_t x0, const uint32_t x1, @@ -27,7 +27,7 @@ struct __m128i : public int4 w = x3; } - __forceinline__ __device__ __m128i( const int x0) + __forceinline__ __device__ __m128i(const int x0) { x = x0; y = x0; @@ -41,8 +41,7 @@ struct __m128i : public int4 x | other.x, y | other.y, z | other.z, - w | other.w - ); + w | other.w); } __forceinline__ __device__ __m128i operator^(const __m128i& other) @@ -51,15 +50,14 @@ struct __m128i : public int4 x ^ other.x, y ^ other.y, z ^ other.z, - w ^ other.w - ); + w ^ other.w); } }; struct __m128 : public float4 { - __forceinline__ __device__ __m128(){} + __forceinline__ __device__ __m128() {} __forceinline__ __device__ __m128( const float x0, const float x1, @@ -71,7 +69,7 @@ struct __m128 : public float4 float4::w = x3; } - __forceinline__ __device__ __m128( const float x0) + __forceinline__ __device__ __m128(const float x0) { float4::x = x0; float4::y = x0; @@ -79,7 +77,7 @@ struct __m128 : public float4 float4::w = x0; } - __forceinline__ __device__ __m128( const __m128i& x0) + __forceinline__ __device__ __m128(const __m128i& x0) { float4::x = int2float(x0.x); float4::y = int2float(x0.y); @@ -87,14 +85,13 @@ struct __m128 : public float4 float4::w = int2float(x0.w); } - __forceinline__ __device__ __m128i get_int( ) + __forceinline__ __device__ __m128i get_int() { return __m128i( (int)x, (int)y, (int)z, - (int)w - ); + (int)w); } __forceinline__ __device__ __m128 operator+(const __m128& other) @@ -103,8 +100,7 @@ struct __m128 : public float4 x + other.x, y + other.y, z + other.z, - w + other.w - ); + w + other.w); } __forceinline__ __device__ __m128 operator-(const __m128& other) @@ -113,8 +109,7 @@ struct __m128 : public float4 x - other.x, y - other.y, z - other.z, - w - other.w - ); + w - other.w); } __forceinline__ __device__ __m128 operator*(const __m128& other) @@ -123,8 +118,7 @@ struct __m128 : public float4 x * other.x, y * other.y, z * other.z, - w * other.w - ); + w * other.w); } __forceinline__ __device__ __m128 operator/(const __m128& other) @@ -133,67 +127,64 @@ struct __m128 : public float4 x / other.x, y / other.y, z / other.z, - w / other.w - ); + w / other.w); } __forceinline__ __device__ __m128& trunc() { - x=::truncf(x); - y=::truncf(y); - z=::truncf(z); - w=::truncf(w); + x = ::truncf(x); + y = ::truncf(y); + z = ::truncf(z); + w = ::truncf(w); return *this; } __forceinline__ __device__ __m128& abs() { - x=::fabsf(x); - y=::fabsf(y); - z=::fabsf(z); - w=::fabsf(w); + x = ::fabsf(x); + y = ::fabsf(y); + z = ::fabsf(z); + w = ::fabsf(w); return *this; } __forceinline__ __device__ __m128& floor() { - x=::floorf(x); - y=::floorf(y); - z=::floorf(z); - w=::floorf(w); + x = ::floorf(x); + y = ::floorf(y); + z = ::floorf(z); + w = ::floorf(w); return *this; } }; - -template +template __device__ void print(const char* name, T value) { printf("g %s: ", name); for(int i = 0; i < 4; ++i) { - printf("%08X ",((uint32_t*)&value)[i]); + printf("%08X ", ((uint32_t*)&value)[i]); } printf("\n"); } -template<> +template <> __device__ void print<__m128>(const char* name, __m128 value) { printf("g %s: ", name); for(int i = 0; i < 4; ++i) { - printf("%f ",((float*)&value)[i]); + printf("%f ", ((float*)&value)[i]); } printf("\n"); } #define SHOW(name) print(#name, name) - __forceinline__ __device__ __m128 _mm_add_ps(__m128 a, __m128 b) { return a + b; @@ -220,8 +211,7 @@ __forceinline__ __device__ __m128 _mm_and_ps(__m128 a, int b) int_as_float(float_as_int(a.x) & b), int_as_float(float_as_int(a.y) & b), int_as_float(float_as_int(a.z) & b), - int_as_float(float_as_int(a.w) & b) - ); + int_as_float(float_as_int(a.w) & b)); } __forceinline__ __device__ __m128 _mm_or_ps(__m128 a, int b) @@ -230,8 +220,7 @@ __forceinline__ __device__ __m128 _mm_or_ps(__m128 a, int b) int_as_float(float_as_int(a.x) | b), int_as_float(float_as_int(a.y) | b), int_as_float(float_as_int(a.z) | b), - int_as_float(float_as_int(a.w) | b) - ); + int_as_float(float_as_int(a.w) | b)); } __forceinline__ __device__ __m128 _mm_xor_ps(__m128 a, int b) @@ -240,20 +229,18 @@ __forceinline__ __device__ __m128 _mm_xor_ps(__m128 a, int b) int_as_float(float_as_int(a.x) ^ b), int_as_float(float_as_int(a.y) ^ b), int_as_float(float_as_int(a.z) ^ b), - int_as_float(float_as_int(a.w) ^ b) - ); + int_as_float(float_as_int(a.w) ^ b)); } __forceinline__ __device__ __m128 _mm_fmod_ps(__m128 v, float dc) { __m128 d(dc); __m128 c = _mm_div_ps(v, d); - c.trunc();//_mm_round_ps(c, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); + c.trunc(); //_mm_round_ps(c, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); // c = _mm_cvtepi32_ps(_mm_cvttps_epi32(c)); - sse2 c = _mm_mul_ps(c, d); return _mm_sub_ps(v, c); - //return a.fmodf(b); } @@ -262,23 +249,20 @@ __forceinline__ __device__ __m128i _mm_xor_si128(__m128i a, __m128i b) return a ^ b; } - __forceinline__ __device__ __m128i _mm_alignr_epi8(__m128i a, const uint32_t rot) { const uint32_t right = 8 * rot; const uint32_t left = (32 - 8 * rot); return __m128i( - ((uint32_t)a.x >> right) | ( a.y << left ), - ((uint32_t)a.y >> right) | ( a.z << left ), - ((uint32_t)a.z >> right) | ( a.w << left ), - ((uint32_t)a.w >> right) | ( a.x << left ) - ); + ((uint32_t)a.x >> right) | (a.y << left), + ((uint32_t)a.y >> right) | (a.z << left), + ((uint32_t)a.z >> right) | (a.w << left), + ((uint32_t)a.w >> right) | (a.x << left)); } -__device__ __m128i* scratchpad_ptr(uint32_t idx, uint32_t n, int *lpad, const uint32_t MASK) { return (__m128i*)((uint8_t*)lpad + (idx & MASK) + n * 16); } - +__device__ __m128i* scratchpad_ptr(uint32_t idx, uint32_t n, int* lpad, const uint32_t MASK) { return (__m128i*)((uint8_t*)lpad + (idx & MASK) + n * 16); } -__forceinline__ __device__ __m128 fma_break(__m128 x) +__forceinline__ __device__ __m128 fma_break(__m128 x) { // Break the dependency chain by setitng the exp to ?????01 x = _mm_and_ps(x, 0xFEFFFFFF); @@ -290,13 +274,13 @@ __forceinline__ __device__ void sub_round(__m128 n0, __m128 n1, __m128 n2, __m12 { n1 = _mm_add_ps(n1, c); __m128 nn = _mm_mul_ps(n0, c); - nn = _mm_mul_ps(n1, _mm_mul_ps(nn,nn)); + nn = _mm_mul_ps(n1, _mm_mul_ps(nn, nn)); nn = fma_break(nn); n = _mm_add_ps(n, nn); n3 = _mm_sub_ps(n3, c); __m128 dd = _mm_mul_ps(n2, c); - dd = _mm_mul_ps(n3, _mm_mul_ps(dd,dd)); + dd = _mm_mul_ps(n3, _mm_mul_ps(dd, dd)); dd = fma_break(dd); d = _mm_add_ps(d, dd); @@ -326,7 +310,7 @@ __forceinline__ __device__ void round_compute(__m128 n0, __m128 n1, __m128 n2, _ // Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0 d = _mm_and_ps(d, 0xFF7FFFFF); d = _mm_or_ps(d, 0x40000000); - r =_mm_add_ps(r, _mm_div_ps(n,d)); + r = _mm_add_ps(r, _mm_div_ps(n, d)); } // 74*8 = 595 @@ -335,15 +319,14 @@ __forceinline__ __device__ __m128i single_comupte(__m128 n0, __m128 n1, __m128 n __m128 c(cnt); // 35 maths calls follow (140 FLOPS) __m128 r = __m128(0.0f); - for(int i=0; i< 4; ++i) + for(int i = 0; i < 4; ++i) round_compute(n0, n1, n2, n3, rnd_c, c, r); // do a quick fmod by setting exp to 2 r = _mm_and_ps(r, 0x807FFFFF); r = _mm_or_ps(r, 0x40000000); - sum = r; // 34 + sum = r; // 34 r = _mm_mul_ps(r, __m128(536870880.0f)); // 35 return r.get_int(); - } __forceinline__ __device__ void single_comupte_wrap(const uint32_t rot, const __m128i& v0, const __m128i& v1, const __m128i& v2, const __m128i& v3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out) @@ -376,8 +359,7 @@ __constant__ uint32_t look[16][4] = { {3, 1, 2, 0}, {3, 2, 0, 1}, {3, 0, 1, 2}, - {3, 0, 2, 1} -}; + {3, 0, 2, 1}}; __constant__ float ccnt[16] = { 1.34375f, @@ -398,16 +380,14 @@ __constant__ float ccnt[16] = { 1.3203125f, 1.3515625f, 1.3359375f, - 1.4609375f -}; - + 1.4609375f}; __forceinline__ __device__ void sync() { -#if (__CUDACC_VER_MAJOR__ >= 9) +#if(__CUDACC_VER_MAJOR__ >= 9) __syncwarp(); #else - __syncthreads( ); + __syncthreads(); #endif } @@ -418,11 +398,11 @@ struct SharedMemChunk }; __global__ void cryptonight_core_gpu_phase2_gpu( - const uint32_t ITERATIONS, const size_t MEMORY, const uint32_t MASK, - int32_t *spad, int *lpad_in, int bfactor, int partidx, uint32_t * roundVs, uint32_t * roundS) + const uint32_t ITERATIONS, const size_t MEMORY, const uint32_t MASK, + int32_t* spad, int* lpad_in, int bfactor, int partidx, uint32_t* roundVs, uint32_t* roundS) { - const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor ); + const int batchsize = (ITERATIONS * 2) >> (1 + bfactor); extern __shared__ SharedMemChunk smemExtern_in[]; @@ -435,7 +415,7 @@ __global__ void cryptonight_core_gpu_phase2_gpu( uint32_t tid = threadIdx.x % 16; - const uint32_t idxHash = blockIdx.x * numHashPerBlock + threadIdx.x/16; + const uint32_t idxHash = blockIdx.x * numHashPerBlock + threadIdx.x / 16; uint32_t s = 0; __m128 vs(0); @@ -470,8 +450,7 @@ __global__ void cryptonight_core_gpu_phase2_gpu( *(smem->out + look[tid][2]), *(smem->out + look[tid][3]), ccnt[tid], rc, smem->va[tid], - smem->out[tid] - ); + smem->out[tid]); sync(); @@ -483,7 +462,7 @@ __global__ void cryptonight_core_gpu_phase2_gpu( ((int*)smem->out)[tid] = outXor; float va_tmp1 = ((float*)smem->va)[block] + ((float*)smem->va)[block + 4]; - float va_tmp2 = ((float*)smem->va)[block+ 8] + ((float*)smem->va)[block + 12]; + float va_tmp2 = ((float*)smem->va)[block + 8] + ((float*)smem->va)[block + 12]; ((float*)smem->va)[tid] = va_tmp1 + va_tmp2; sync(); @@ -505,10 +484,10 @@ __global__ void cryptonight_core_gpu_phase2_gpu( vs = _mm_div_ps(vs, __m128(64.0f)); s = out2.x ^ out2.y ^ out2.z ^ out2.w; } - if(partidx != ((1<> ( 1 + bfactor ); const int start = partidx * batchsize; diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp index 96cb679f5..48ebe4bd7 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp @@ -2,8 +2,8 @@ #pragma once #include -#include #include +#include #include /** execute and check a CUDA api command @@ -12,27 +12,30 @@ * @param msg message string which should be added to the error message * @param ... CUDA api command */ -#define CUDA_CHECK_MSG(id, msg, ...) { \ - cudaError_t error = __VA_ARGS__; \ - if(error!=cudaSuccess){ \ - std::cerr << "[CUDA] Error gpu " << id << ": <" << __FILE__ << ">:" << __LINE__; \ - std::cerr << msg << std::endl; \ - throw std::runtime_error(std::string("[CUDA] Error: ") + std::string(cudaGetErrorString(error))); \ - } \ -} \ -( (void) 0 ) - -#define CU_CHECK(id, ...) { \ - CUresult result = __VA_ARGS__; \ - if(result != CUDA_SUCCESS){ \ - const char* s; \ - cuGetErrorString(result, &s); \ - std::cerr << "[CUDA] Error gpu " << id << ": <" << __FUNCTION__ << ">:" << __LINE__ << " \"" << (s ? s : "unknown error") << "\"" << std::endl; \ - throw std::runtime_error(std::string("[CUDA] Error: ") + std::string(s ? s : "unknown error")); \ - } \ -} \ -( (void) 0 ) +#define CUDA_CHECK_MSG(id, msg, ...) \ + { \ + cudaError_t error = __VA_ARGS__; \ + if(error != cudaSuccess) \ + { \ + std::cerr << "[CUDA] Error gpu " << id << ": <" << __FILE__ << ">:" << __LINE__; \ + std::cerr << msg << std::endl; \ + throw std::runtime_error(std::string("[CUDA] Error: ") + std::string(cudaGetErrorString(error))); \ + } \ + } \ + ((void)0) +#define CU_CHECK(id, ...) \ + { \ + CUresult result = __VA_ARGS__; \ + if(result != CUDA_SUCCESS) \ + { \ + const char* s; \ + cuGetErrorString(result, &s); \ + std::cerr << "[CUDA] Error gpu " << id << ": <" << __FUNCTION__ << ">:" << __LINE__ << " \"" << (s ? s : "unknown error") << "\"" << std::endl; \ + throw std::runtime_error(std::string("[CUDA] Error: ") + std::string(s ? s : "unknown error")); \ + } \ + } \ + ((void)0) /** execute and check a CUDA api command * @@ -47,7 +50,7 @@ * @param ... CUDA kernel call */ #define CUDA_CHECK_KERNEL(id, ...) \ - __VA_ARGS__; \ + __VA_ARGS__; \ CUDA_CHECK(id, cudaGetLastError()) /** execute and check a CUDA kernel @@ -57,5 +60,5 @@ * @param ... CUDA kernel call */ #define CUDA_CHECK_MSG_KERNEL(id, msg, ...) \ - __VA_ARGS__; \ + __VA_ARGS__; \ CUDA_CHECK_MSG(id, msg, cudaGetLastError()) diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu index b6e41c619..aa7c17057 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu @@ -1,83 +1,80 @@ -#include -#include -#include -#include +#include "xmrstak/jconf.hpp" #include -#include #include #include -#include -#include "xmrstak/jconf.hpp" - +#include +#include +#include +#include +#include typedef unsigned char BitSequence; typedef unsigned long long DataLength; -#include "xmrstak/backend/cryptonight.hpp" #include "cryptonight.hpp" -#include "cuda_extra.hpp" -#include "cuda_keccak.hpp" +#include "cuda_aes.hpp" #include "cuda_blake.hpp" +#include "cuda_device.hpp" +#include "cuda_extra.hpp" #include "cuda_groestl.hpp" #include "cuda_jh.hpp" +#include "cuda_keccak.hpp" #include "cuda_skein.hpp" -#include "cuda_device.hpp" -#include "cuda_aes.hpp" +#include "xmrstak/backend/cryptonight.hpp" -__constant__ uint8_t d_sub_byte[16][16] ={ - {0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 }, - {0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 }, - {0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 }, - {0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 }, - {0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 }, - {0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf }, - {0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 }, - {0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 }, - {0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 }, - {0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb }, - {0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 }, - {0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 }, - {0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a }, - {0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e }, - {0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf }, - {0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 } -}; - -__device__ __forceinline__ void cryptonight_aes_set_key( uint32_t * __restrict__ key, const uint32_t * __restrict__ data ) +__constant__ uint8_t d_sub_byte[16][16] = { + {0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76}, + {0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0}, + {0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15}, + {0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75}, + {0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84}, + {0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf}, + {0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8}, + {0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2}, + {0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73}, + {0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb}, + {0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79}, + {0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08}, + {0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a}, + {0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e}, + {0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf}, + {0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16}}; + +__device__ __forceinline__ void cryptonight_aes_set_key(uint32_t* __restrict__ key, const uint32_t* __restrict__ data) { int i, j; uint8_t temp[4]; - const uint32_t aes_gf[] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36 }; + const uint32_t aes_gf[] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36}; - MEMSET4( key, 0, 40 ); - MEMCPY4( key, data, 8 ); + MEMSET4(key, 0, 40); + MEMCPY4(key, data, 8); #pragma unroll - for ( i = 8; i < 40; i++ ) + for(i = 8; i < 40; i++) { - *(uint32_t *) temp = key[i - 1]; - if ( i % 8 == 0 ) + *(uint32_t*)temp = key[i - 1]; + if(i % 8 == 0) { - *(uint32_t *) temp = ROTR32( *(uint32_t *) temp, 8 ); - for ( j = 0; j < 4; j++ ) - temp[j] = d_sub_byte[( temp[j] >> 4 ) & 0x0f][temp[j] & 0x0f]; - *(uint32_t *) temp ^= aes_gf[i / 8 - 1]; + *(uint32_t*)temp = ROTR32(*(uint32_t*)temp, 8); + for(j = 0; j < 4; j++) + temp[j] = d_sub_byte[(temp[j] >> 4) & 0x0f][temp[j] & 0x0f]; + *(uint32_t*)temp ^= aes_gf[i / 8 - 1]; } else { - if ( i % 8 == 4 ) + if(i % 8 == 4) { #pragma unroll - for ( j = 0; j < 4; j++ ) - temp[j] = d_sub_byte[( temp[j] >> 4 ) & 0x0f][temp[j] & 0x0f]; + for(j = 0; j < 4; j++) + temp[j] = d_sub_byte[(temp[j] >> 4) & 0x0f][temp[j] & 0x0f]; } } - key[i] = key[( i - 8 )] ^ *(uint32_t *) temp; + key[i] = key[(i - 8)] ^ *(uint32_t*)temp; } } -__device__ __forceinline__ void mix_and_propagate( uint32_t* state ) +__device__ __forceinline__ void mix_and_propagate(uint32_t* state) { uint32_t tmp0[4]; for(size_t x = 0; x < 4; ++x) @@ -93,18 +90,18 @@ __device__ __forceinline__ void mix_and_propagate( uint32_t* state ) (state + 4 * 7)[x] = (state + 4 * 7)[x] ^ tmp0[x]; } -template -__global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restrict__ d_input, uint32_t len, uint32_t startNonce, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_state2, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2 ) +template +__global__ void cryptonight_extra_gpu_prepare(int threads, uint32_t* __restrict__ d_input, uint32_t len, uint32_t startNonce, uint32_t* __restrict__ d_ctx_state, uint32_t* __restrict__ d_ctx_state2, uint32_t* __restrict__ d_ctx_a, uint32_t* __restrict__ d_ctx_b, uint32_t* __restrict__ d_ctx_key1, uint32_t* __restrict__ d_ctx_key2) { - int thread = ( blockDim.x * blockIdx.x + threadIdx.x ); + int thread = (blockDim.x * blockIdx.x + threadIdx.x); __shared__ uint32_t sharedMemory[1024]; if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) { - cn_aes_gpu_init( sharedMemory ); - __syncthreads( ); + cn_aes_gpu_init(sharedMemory); + __syncthreads(); } - if ( thread >= threads ) + if(thread >= threads) return; uint32_t ctx_state[50]; @@ -114,29 +111,29 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric uint32_t ctx_key2[40]; uint32_t input[32]; - memcpy( input, d_input, len ); + memcpy(input, d_input, len); //*((uint32_t *)(((char *)input) + 39)) = startNonce + thread; uint32_t nonce = startNonce + thread; - for ( int i = 0; i < sizeof (uint32_t ); ++i ) - ( ( (char *) input ) + 39 )[i] = ( (char*) ( &nonce ) )[i]; //take care of pointer alignment + for(int i = 0; i < sizeof(uint32_t); ++i) + (((char*)input) + 39)[i] = ((char*)(&nonce))[i]; //take care of pointer alignment - cn_keccak( (uint8_t *) input, len, (uint8_t *) ctx_state ); - cryptonight_aes_set_key( ctx_key1, ctx_state ); - cryptonight_aes_set_key( ctx_key2, ctx_state + 8 ); + cn_keccak((uint8_t*)input, len, (uint8_t*)ctx_state); + cryptonight_aes_set_key(ctx_key1, ctx_state); + cryptonight_aes_set_key(ctx_key2, ctx_state + 8); - XOR_BLOCKS_DST( ctx_state, ctx_state + 8, ctx_a ); - XOR_BLOCKS_DST( ctx_state + 4, ctx_state + 12, ctx_b ); - memcpy( d_ctx_a + thread * 4, ctx_a, 4 * 4 ); + XOR_BLOCKS_DST(ctx_state, ctx_state + 8, ctx_a); + XOR_BLOCKS_DST(ctx_state + 4, ctx_state + 12, ctx_b); + memcpy(d_ctx_a + thread * 4, ctx_a, 4 * 4); if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) { - memcpy( d_ctx_b + thread * 16, ctx_b, 4 * 4 ); + memcpy(d_ctx_b + thread * 16, ctx_b, 4 * 4); // bx1 - XOR_BLOCKS_DST( ctx_state + 16, ctx_state + 20, ctx_b ); - memcpy( d_ctx_b + thread * 16 + 4, ctx_b, 4 * 4 ); + XOR_BLOCKS_DST(ctx_state + 16, ctx_state + 20, ctx_b); + memcpy(d_ctx_b + thread * 16 + 4, ctx_b, 4 * 4); // division_result - memcpy( d_ctx_b + thread * 16 + 2 * 4, ctx_state + 24, 4 * 2 ); + memcpy(d_ctx_b + thread * 16 + 2 * 4, ctx_state + 24, 4 * 2); // sqrt_result - memcpy( d_ctx_b + thread * 16 + 2 * 4 + 2, ctx_state + 26, 4 * 2 ); + memcpy(d_ctx_b + thread * 16 + 2 * 4 + 2, ctx_state + 26, 4 * 2); } else if(ALGO == cryptonight_r_wow || ALGO == cryptonight_r) { @@ -148,31 +145,31 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric memcpy(d_ctx_b + thread * 16 + 2 * 4, ctx_state + 24, 4 * 8); } else - memcpy( d_ctx_b + thread * 4, ctx_b, 4 * 4 ); + memcpy(d_ctx_b + thread * 4, ctx_b, 4 * 4); - memcpy( d_ctx_key1 + thread * 40, ctx_key1, 40 * 4 ); - memcpy( d_ctx_key2 + thread * 40, ctx_key2, 40 * 4 ); - memcpy( d_ctx_state + thread * 50, ctx_state, 50 * 4 ); + memcpy(d_ctx_key1 + thread * 40, ctx_key1, 40 * 4); + memcpy(d_ctx_key2 + thread * 40, ctx_key2, 40 * 4); + memcpy(d_ctx_state + thread * 50, ctx_state, 50 * 4); if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) { - for(int i=0; i < 16; i++) + for(int i = 0; i < 16; i++) { for(size_t t = 4; t < 12; ++t) { - cn_aes_pseudo_round_mut( sharedMemory, ctx_state + 4u * t, ctx_key1 ); + cn_aes_pseudo_round_mut(sharedMemory, ctx_state + 4u * t, ctx_key1); } // scipt first 4 * 128bit blocks = 4 * 4 uint32_t values mix_and_propagate(ctx_state + 4 * 4); } // double buffer to move manipulated state into phase1 - memcpy( d_ctx_state2 + thread * 50, ctx_state, 50 * 4 ); + memcpy(d_ctx_state2 + thread * 50, ctx_state, 50 * 4); } } -template -__global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint32_t* __restrict__ d_res_count, uint32_t * __restrict__ d_res_nonce, uint32_t * __restrict__ d_ctx_state,uint32_t * __restrict__ d_ctx_key2 ) +template +__global__ void cryptonight_extra_gpu_final(int threads, uint64_t target, uint32_t* __restrict__ d_res_count, uint32_t* __restrict__ d_res_nonce, uint32_t* __restrict__ d_ctx_state, uint32_t* __restrict__ d_ctx_key2) { const int thread = blockDim.x * blockIdx.x + threadIdx.x; @@ -181,19 +178,19 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3 if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) { - cn_aes_gpu_init( sharedMemory ); - __syncthreads( ); + cn_aes_gpu_init(sharedMemory); + __syncthreads(); } - if ( thread >= threads ) + if(thread >= threads) return; int i; - uint32_t * __restrict__ ctx_state = d_ctx_state + thread * 50; + uint32_t* __restrict__ ctx_state = d_ctx_state + thread * 50; uint64_t hash[4]; uint32_t state[50]; - #pragma unroll - for ( i = 0; i < 50; i++ ) +#pragma unroll + for(i = 0; i < 50; i++) state[i] = ctx_state[i]; if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || @@ -202,25 +199,25 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3 uint32_t key[40]; // load keys - MEMCPY8( key, d_ctx_key2 + thread * 40, 20 ); + MEMCPY8(key, d_ctx_key2 + thread * 40, 20); - for(int i=0; i < 16; i++) + for(int i = 0; i < 16; i++) { for(size_t t = 4; t < 12; ++t) { - cn_aes_pseudo_round_mut( sharedMemory, state + 4u * t, key ); + cn_aes_pseudo_round_mut(sharedMemory, state + 4u * t, key); } // scipt first 4 * 128bit blocks = 4 * 4 uint32_t values mix_and_propagate(state + 4 * 4); } } - cn_keccakf2( (uint64_t *) state ); + cn_keccakf2((uint64_t*)state); if(ALGO == cryptonight_gpu) { - if ( ((uint64_t*)state)[3] < target ) + if(((uint64_t*)state)[3] < target) { - uint32_t idx = atomicInc( d_res_count, 0xFFFFFFFF ); + uint32_t idx = atomicInc(d_res_count, 0xFFFFFFFF); if(idx < 10) d_res_nonce[idx] = thread; @@ -228,19 +225,19 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3 } else { - switch ( ( (uint8_t *) state )[0] & 0x03 ) + switch(((uint8_t*)state)[0] & 0x03) { case 0: - cn_blake( (const uint8_t *) state, 200, (uint8_t *) hash ); + cn_blake((const uint8_t*)state, 200, (uint8_t*)hash); break; case 1: - cn_groestl( (const BitSequence *) state, 200, (BitSequence *) hash ); + cn_groestl((const BitSequence*)state, 200, (BitSequence*)hash); break; case 2: - cn_jh( (const BitSequence *) state, 200, (BitSequence *) hash ); + cn_jh((const BitSequence*)state, 200, (BitSequence*)hash); break; case 3: - cn_skein( (const BitSequence *) state, 200, (BitSequence *) hash ); + cn_skein((const BitSequence*)state, 200, (BitSequence*)hash); break; default: break; @@ -249,9 +246,9 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3 // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values // and expect an accurate result for target > 32-bit without implementing carries - if ( hash[3] < target ) + if(hash[3] < target) { - uint32_t idx = atomicInc( d_res_count, 0xFFFFFFFF ); + uint32_t idx = atomicInc(d_res_count, 0xFFFFFFFF); if(idx < 10) d_res_nonce[idx] = thread; @@ -259,10 +256,10 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3 } } -extern "C" void cryptonight_extra_cpu_set_data( nvid_ctx* ctx, const void *data, uint32_t len ) +extern "C" void cryptonight_extra_cpu_set_data(nvid_ctx* ctx, const void* data, uint32_t len) { ctx->inputlen = len; - CUDA_CHECK(ctx->device_id, cudaMemcpy( ctx->d_input, data, len, cudaMemcpyHostToDevice )); + CUDA_CHECK(ctx->device_id, cudaMemcpy(ctx->d_input, data, len, cudaMemcpyHostToDevice)); } extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) @@ -290,7 +287,6 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) case 3: CUDA_CHECK(ctx->device_id, cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync)); break; - }; // prefer shared memory over L1 cache @@ -314,8 +310,7 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end() || std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_haven) != neededAlgorithms.end() || std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_bittube2) != neededAlgorithms.end() || - std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_superfast) != neededAlgorithms.end() - ) + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_superfast) != neededAlgorithms.end()) { // extent ctx_b to hold the state of idx0 ctx_b_size += sizeof(uint32_t) * wsize; @@ -326,16 +321,14 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) { ctx_b_size += sizeof(uint32_t) * 4 * wsize; } - else if((std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end()) - || (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_v8_reversewaltz) != neededAlgorithms.end())) + else if((std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end()) || (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_v8_reversewaltz) != neededAlgorithms.end())) { // bx0 (16byte), bx1 (16byte), division_result (8byte) and sqrt_result (8byte), padding (16byte) ctx_b_size = 4 * 4 * sizeof(uint32_t) * wsize; } else if( std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_r) != neededAlgorithms.end() || - std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_r_wow) != neededAlgorithms.end() - ) + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_r_wow) != neededAlgorithms.end()) { // bx0 (16byte), bx1 (16byte), and [r0, r1, r2, r3] (a 8byte) ctx_b_size = 4 * 4 * sizeof(uint32_t) * wsize; @@ -349,9 +342,9 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_a, 4 * sizeof(uint32_t) * wsize)); CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_b, ctx_b_size)); // POW block format http://monero.wikia.com/wiki/PoW_Block_Header_Format - CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_input, 32 * sizeof (uint32_t ) )); - CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_count, sizeof (uint32_t ) )); - CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_nonce, 10 * sizeof (uint32_t ) )); + CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_input, 32 * sizeof(uint32_t))); + CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_count, sizeof(uint32_t))); + CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_nonce, 10 * sizeof(uint32_t))); CUDA_CHECK_MSG( ctx->device_id, "\n**suggestion: Try to reduce the value of the attribute 'threads' in the NVIDIA config file.**", @@ -364,106 +357,102 @@ extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce int threadsperblock = 128; uint32_t wsize = ctx->device_blocks * ctx->device_threads; - dim3 grid( ( wsize + threadsperblock - 1 ) / threadsperblock ); - dim3 block( threadsperblock ); + dim3 grid((wsize + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); if(miner_algo == cryptonight_heavy) { - CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, - ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>(wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2)); } else if(miner_algo == cryptonight_haven) { - CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, - ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>(wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2)); } else if(miner_algo == cryptonight_superfast) { - CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, - ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>(wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2)); } else if(miner_algo == cryptonight_bittube2) { - CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, - ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>(wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2)); } else if(miner_algo == cryptonight_monero_v8) { - CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, - ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>(wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2)); } else if(miner_algo == cryptonight_gpu) { - CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, - ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>(wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2)); } else if(miner_algo == cryptonight_r) { - CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, - ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>(wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2)); } else if(miner_algo == cryptonight_r_wow) { - CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, - ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>(wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2)); } else if(miner_algo == cryptonight_v8_reversewaltz) { - CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, - ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>(wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state, ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2)); } else { /* pass two times d_ctx_state because the second state is used later in phase1, * the first is used than in phase3 */ - CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, - ctx->d_ctx_state, ctx->d_ctx_state, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>(wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state, ctx->d_ctx_state, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2)); } } -extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce, const xmrstak_algo& miner_algo) +extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t* resnonce, const xmrstak_algo& miner_algo) { int threadsperblock = 128; uint32_t wsize = ctx->device_blocks * ctx->device_threads; - dim3 grid( ( wsize + threadsperblock - 1 ) / threadsperblock ); - dim3 block( threadsperblock ); + dim3 grid((wsize + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); - CUDA_CHECK(ctx->device_id, cudaMemset( ctx->d_result_nonce, 0xFF, 10 * sizeof (uint32_t ) )); - CUDA_CHECK(ctx->device_id, cudaMemset( ctx->d_result_count, 0, sizeof (uint32_t ) )); + CUDA_CHECK(ctx->device_id, cudaMemset(ctx->d_result_nonce, 0xFF, 10 * sizeof(uint32_t))); + CUDA_CHECK(ctx->device_id, cudaMemset(ctx->d_result_count, 0, sizeof(uint32_t))); if(miner_algo == cryptonight_heavy) { CUDA_CHECK_MSG_KERNEL( ctx->device_id, "\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**", - cryptonight_extra_gpu_final<<>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 ) - ); + cryptonight_extra_gpu_final<<>>(wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2)); } else if(miner_algo == cryptonight_haven) { CUDA_CHECK_MSG_KERNEL( ctx->device_id, "\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**", - cryptonight_extra_gpu_final<<>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 ) - ); + cryptonight_extra_gpu_final<<>>(wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2)); } else if(miner_algo == cryptonight_superfast) { CUDA_CHECK_MSG_KERNEL( ctx->device_id, "\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**", - cryptonight_extra_gpu_final<<>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 ) - ); + cryptonight_extra_gpu_final<<>>(wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2)); } else if(miner_algo == cryptonight_bittube2) { CUDA_CHECK_MSG_KERNEL( ctx->device_id, "\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**", - cryptonight_extra_gpu_final<<>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 ) - ); + cryptonight_extra_gpu_final<<>>(wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2)); } else if(miner_algo == cryptonight_gpu) { @@ -471,8 +460,7 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, CUDA_CHECK_MSG_KERNEL( ctx->device_id, "\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**", - cryptonight_extra_gpu_final<<>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 ) - ); + cryptonight_extra_gpu_final<<>>(wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2)); } else { @@ -480,16 +468,14 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, CUDA_CHECK_MSG_KERNEL( ctx->device_id, "\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**", - cryptonight_extra_gpu_final<<>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 ) - ); + cryptonight_extra_gpu_final<<>>(wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state, ctx->d_ctx_key2)); } - CUDA_CHECK(ctx->device_id, cudaMemcpy( rescount, ctx->d_result_count, sizeof (uint32_t ), cudaMemcpyDeviceToHost )); + CUDA_CHECK(ctx->device_id, cudaMemcpy(rescount, ctx->d_result_count, sizeof(uint32_t), cudaMemcpyDeviceToHost)); CUDA_CHECK_MSG( ctx->device_id, "\n**suggestion: Try to increase the attribute 'bfactor' in the NVIDIA config file.**", - cudaMemcpy( resnonce, ctx->d_result_nonce, 10 * sizeof (uint32_t ), cudaMemcpyDeviceToHost ) - ); + cudaMemcpy(resnonce, ctx->d_result_nonce, 10 * sizeof(uint32_t), cudaMemcpyDeviceToHost)); /* There is only a 32bit limit for the counter on the device side * therefore this value can be greater than 10, in that case limit rescount @@ -497,11 +483,11 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, */ if(*rescount > 10) *rescount = 10; - for(int i=0; i < *rescount; i++) + for(int i = 0; i < *rescount; i++) resnonce[i] += startNonce; } -extern "C" int cuda_get_devicecount( int* deviceCount) +extern "C" int cuda_get_devicecount(int* deviceCount) { cudaError_t err; *deviceCount = 0; @@ -587,17 +573,17 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) std::vector arch; #define XMRSTAK_PP_TOSTRING1(str) #str #define XMRSTAK_PP_TOSTRING(str) XMRSTAK_PP_TOSTRING1(str) - char const * archStringList = XMRSTAK_PP_TOSTRING(XMRSTAK_CUDA_ARCH_LIST); + char const* archStringList = XMRSTAK_PP_TOSTRING(XMRSTAK_CUDA_ARCH_LIST); #undef XMRSTAK_PP_TOSTRING #undef XMRSTAK_PP_TOSTRING1 std::stringstream ss(archStringList); //transform string list separated with `+` into a vector of integers int tmpArch; - while ( ss >> tmpArch ) - arch.push_back( tmpArch ); + while(ss >> tmpArch) + arch.push_back(tmpArch); - #define MSG_CUDA_NO_ARCH "WARNING: skip device - binary does not contain required device architecture\n" +#define MSG_CUDA_NO_ARCH "WARNING: skip device - binary does not contain required device architecture\n" if(gpuArch >= 20 && gpuArch < 30) { // compiled binary must support sm_20 for fermi @@ -618,7 +604,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) * with a sm_20 only compiled binary */ for(int i = 0; i < arch.size(); ++i) - if(arch[i] >= 30 && (minSupportedArch == 0 || arch[i] < minSupportedArch)) + if(arch[i] >= 30 && (minSupportedArch == 0 || arch[i] < minSupportedArch)) minSupportedArch = arch[i]; if(minSupportedArch < 30 || gpuArch < minSupportedArch) { @@ -630,7 +616,6 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); bool useCryptonight_gpu = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_gpu) != neededAlgorithms.end(); - // set all device option those marked as auto (-1) to a valid value if(ctx->device_blocks == -1) { @@ -700,7 +685,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) int* tmp; cudaError_t err; - #define MSG_CUDA_FUNC_FAIL "WARNING: skip device - %s failed\n" +#define MSG_CUDA_FUNC_FAIL "WARNING: skip device - %s failed\n" // a device must be selected to get the right memory usage later on err = cudaSetDevice(ctx->device_id); if(err != cudaSuccess) @@ -716,7 +701,6 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) return 3; } - size_t freeMemory = 0; size_t totalMemory = 0; CUDA_CHECK(ctx->device_id, cudaMemGetInfo(&freeMemory, &totalMemory)); @@ -746,7 +730,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) size_t usedMem = totalMemory - freeMemory; if(usedMem >= maxMemUsage) { - printf("WARNING: skip device - already %s MiB memory in use\n", std::to_string(usedMem/byteToMiB).c_str()); + printf("WARNING: skip device - already %s MiB memory in use\n", std::to_string(usedMem / byteToMiB).c_str()); return 4; } else @@ -764,8 +748,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end() || std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_haven) != neededAlgorithms.end() || std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_bittube2) != neededAlgorithms.end() || - std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_superfast) != neededAlgorithms.end() - ) + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_superfast) != neededAlgorithms.end()) perThread += 50 * 4; // state double buffer size_t max_intensity = limitedMemory / perThread; @@ -806,19 +789,18 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) size_t blockOptimal = 8 * ctx->device_mpcount; // the following values are calculated with CUDA10 and the occupancy calculator - if(gpuArch == 35 || gpuArch/10 == 5 || gpuArch/10 == 6) - blockOptimal = 7 * ctx->device_mpcount; + if(gpuArch == 35 || gpuArch / 10 == 5 || gpuArch / 10 == 6) + blockOptimal = 7 * ctx->device_mpcount; if(gpuArch == 37) - blockOptimal = 14 * ctx->device_mpcount; + blockOptimal = 14 * ctx->device_mpcount; if(gpuArch >= 70) - blockOptimal = 6 * ctx->device_mpcount; + blockOptimal = 6 * ctx->device_mpcount; if(blockOptimal * threads * hashMemSize < limitedMemory) { ctx->device_threads = threads; ctx->device_blocks = blockOptimal; } - } } printf("device init succeeded\n"); diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp index 4d369f843..09cdd6646 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp @@ -11,22 +11,22 @@ struct uint3 unsigned int x, y, z; }; -struct uint3 threadIdx; -struct uint3 blockIdx; -struct uint3 blockDim; -#define __funnelshift_r(a,b,c) 1 +struct uint3 threadIdx; +struct uint3 blockIdx; +struct uint3 blockDim; +#define __funnelshift_r(a, b, c) 1 #define __syncthreads() #define asm(x) -#define __shfl(a,b,c) 1 +#define __shfl(a, b, c) 1 #endif -#define AES_BLOCK_SIZE 16 -#define AES_KEY_SIZE 32 -#define INIT_SIZE_BLK 8 +#define AES_BLOCK_SIZE 16 +#define AES_KEY_SIZE 32 +#define INIT_SIZE_BLK 8 #define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) // 128 B -#define C32(x) ((uint32_t)(x ## U)) -#define T32(x) ((x) & C32(0xFFFFFFFF)) +#define C32(x) ((uint32_t)(x##U)) +#define T32(x) ((x)&C32(0xFFFFFFFF)) #if __CUDA_ARCH__ >= 350 __forceinline__ __device__ uint64_t cuda_ROTL64(const uint64_t value, const int offset) @@ -34,71 +34,91 @@ __forceinline__ __device__ uint64_t cuda_ROTL64(const uint64_t value, const int uint2 result; if(offset >= 32) { - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" + : "=r"(result.x) + : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" + : "=r"(result.y) + : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); } else { - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" + : "=r"(result.x) + : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" + : "=r"(result.y) + : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); } - return __double_as_longlong(__hiloint2double(result.y, result.x)); + return __double_as_longlong(__hiloint2double(result.y, result.x)); } #define ROTL64(x, n) (cuda_ROTL64(x, n)) #else -#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) +#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) #endif #if __CUDA_ARCH__ < 350 #define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n)))) #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) #else -#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) ) -#define ROTR32(x, n) __funnelshift_r( (x), (x), (n) ) +#define ROTL32(x, n) __funnelshift_l((x), (x), (n)) +#define ROTR32(x, n) __funnelshift_r((x), (x), (n)) #endif -#define MEMSET8(dst,what,cnt) { \ - int i_memset8; \ - uint64_t *out_memset8 = (uint64_t *)(dst); \ - for( i_memset8 = 0; i_memset8 < cnt; i_memset8++ ) \ - out_memset8[i_memset8] = (what); } - -#define MEMSET4(dst,what,cnt) { \ - int i_memset4; \ - uint32_t *out_memset4 = (uint32_t *)(dst); \ - for( i_memset4 = 0; i_memset4 < cnt; i_memset4++ ) \ - out_memset4[i_memset4] = (what); } - -#define MEMCPY8(dst,src,cnt) { \ - int i_memcpy8; \ - uint64_t *in_memcpy8 = (uint64_t *)(src); \ - uint64_t *out_memcpy8 = (uint64_t *)(dst); \ - for( i_memcpy8 = 0; i_memcpy8 < cnt; i_memcpy8++ ) \ - out_memcpy8[i_memcpy8] = in_memcpy8[i_memcpy8]; } - -#define MEMCPY4(dst,src,cnt) { \ - int i_memcpy4; \ - uint32_t *in_memcpy4 = (uint32_t *)(src); \ - uint32_t *out_memcpy4 = (uint32_t *)(dst); \ - for( i_memcpy4 = 0; i_memcpy4 < cnt; i_memcpy4++ ) \ - out_memcpy4[i_memcpy4] = in_memcpy4[i_memcpy4]; } - -#define XOR_BLOCKS(a,b) { \ - ((uint64_t *)a)[0] ^= ((uint64_t *)b)[0]; \ - ((uint64_t *)a)[1] ^= ((uint64_t *)b)[1]; } - -#define XOR_BLOCKS_DST(x,y,z) { \ - ((uint64_t *)z)[0] = ((uint64_t *)(x))[0] ^ ((uint64_t *)(y))[0]; \ - ((uint64_t *)z)[1] = ((uint64_t *)(x))[1] ^ ((uint64_t *)(y))[1]; } - -#define MUL_SUM_XOR_DST(a,c,dst) { \ - const uint64_t dst0 = ((uint64_t *)dst)[0]; \ - uint64_t hi, lo = cuda_mul128(((uint64_t *)a)[0], dst0, &hi) + ((uint64_t *)c)[1]; \ - hi += ((uint64_t *)c)[0]; \ - ((uint64_t *)c)[0] = dst0 ^ hi; \ - ((uint64_t *)dst)[0] = hi; \ - ((uint64_t *)c)[1] = atomicExch(((unsigned long long int *)dst) + 1, (unsigned long long int)lo) ^ lo; \ +#define MEMSET8(dst, what, cnt) \ + { \ + int i_memset8; \ + uint64_t* out_memset8 = (uint64_t*)(dst); \ + for(i_memset8 = 0; i_memset8 < cnt; i_memset8++) \ + out_memset8[i_memset8] = (what); \ } -#define E2I(x) ((size_t)(((*((uint64_t*)(x)) >> 4) & 0x1ffff))) +#define MEMSET4(dst, what, cnt) \ + { \ + int i_memset4; \ + uint32_t* out_memset4 = (uint32_t*)(dst); \ + for(i_memset4 = 0; i_memset4 < cnt; i_memset4++) \ + out_memset4[i_memset4] = (what); \ + } + +#define MEMCPY8(dst, src, cnt) \ + { \ + int i_memcpy8; \ + uint64_t* in_memcpy8 = (uint64_t*)(src); \ + uint64_t* out_memcpy8 = (uint64_t*)(dst); \ + for(i_memcpy8 = 0; i_memcpy8 < cnt; i_memcpy8++) \ + out_memcpy8[i_memcpy8] = in_memcpy8[i_memcpy8]; \ + } +#define MEMCPY4(dst, src, cnt) \ + { \ + int i_memcpy4; \ + uint32_t* in_memcpy4 = (uint32_t*)(src); \ + uint32_t* out_memcpy4 = (uint32_t*)(dst); \ + for(i_memcpy4 = 0; i_memcpy4 < cnt; i_memcpy4++) \ + out_memcpy4[i_memcpy4] = in_memcpy4[i_memcpy4]; \ + } + +#define XOR_BLOCKS(a, b) \ + { \ + ((uint64_t*)a)[0] ^= ((uint64_t*)b)[0]; \ + ((uint64_t*)a)[1] ^= ((uint64_t*)b)[1]; \ + } + +#define XOR_BLOCKS_DST(x, y, z) \ + { \ + ((uint64_t*)z)[0] = ((uint64_t*)(x))[0] ^ ((uint64_t*)(y))[0]; \ + ((uint64_t*)z)[1] = ((uint64_t*)(x))[1] ^ ((uint64_t*)(y))[1]; \ + } + +#define MUL_SUM_XOR_DST(a, c, dst) \ + { \ + const uint64_t dst0 = ((uint64_t*)dst)[0]; \ + uint64_t hi, lo = cuda_mul128(((uint64_t*)a)[0], dst0, &hi) + ((uint64_t*)c)[1]; \ + hi += ((uint64_t*)c)[0]; \ + ((uint64_t*)c)[0] = dst0 ^ hi; \ + ((uint64_t*)dst)[0] = hi; \ + ((uint64_t*)c)[1] = atomicExch(((unsigned long long int*)dst) + 1, (unsigned long long int)lo) ^ lo; \ + } + +#define E2I(x) ((size_t)(((*((uint64_t*)(x)) >> 4) & 0x1ffff))) diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp index 555ccbef2..a8dd1fcb2 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp @@ -2,7 +2,6 @@ #include - __device__ __forceinline__ int64_t fast_div_heavy(int64_t _a, int _b) { diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp index 0d54f1436..1fc85b2d0 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp @@ -18,19 +18,19 @@ __device__ __forceinline__ uint64_t fast_div_v2(uint64_t a, uint32_t b) { const uint32_t r = get_reciprocal(b); const uint32_t a1 = ((uint32_t*)&a)[1]; - const uint64_t k = __umulhi(((uint32_t*)&a)[0], r) + ((uint64_t)(r) * a1) + a; + const uint64_t k = __umulhi(((uint32_t*)&a)[0], r) + ((uint64_t)(r)*a1) + a; const uint32_t q = ((uint32_t*)&k)[1]; - int64_t tmp = a - ((uint64_t)(q) * b); + int64_t tmp = a - ((uint64_t)(q)*b); ((int32_t*)(&tmp))[1] -= q < a1 ? b : 0; - + const int overshoot = ((int*)(&tmp))[1] >> 31; const int64_t tmp_u = (uint32_t)(b - 1) - tmp; const int undershoot = ((int*)&tmp_u)[1] >> 31; uint64_t result; ((uint32_t*)&result)[0] = q + overshoot - undershoot; - ((uint32_t*)&result)[1] = ((uint32_t*)(&tmp))[0] + ((uint32_t)(overshoot) & b) - ((uint32_t)(undershoot) & b); + ((uint32_t*)&result)[1] = ((uint32_t*)(&tmp))[0] + ((uint32_t)(overshoot)&b) - ((uint32_t)(undershoot)&b); return result; } @@ -39,14 +39,18 @@ __device__ __forceinline__ uint32_t fast_sqrt_v2(const uint64_t n1) { float x = __uint_as_float((((uint32_t*)&n1)[1] >> 9) + ((64U + 127U) << 23)); float x1; - asm("rsqrt.approx.f32 %0, %1;" : "=f"(x1) : "f"(x)); - asm("sqrt.approx.f32 %0, %1;" : "=f"(x) : "f"(x)); + asm("rsqrt.approx.f32 %0, %1;" + : "=f"(x1) + : "f"(x)); + asm("sqrt.approx.f32 %0, %1;" + : "=f"(x) + : "f"(x)); // The following line does x1 *= 4294967296.0f; x1 = __uint_as_float(__float_as_uint(x1) + (32U << 23)); const uint32_t x0 = __float_as_uint(x) - (158U << 23); - const int64_t delta0 = n1 - (((int64_t)(x0) * x0) << 18); + const int64_t delta0 = n1 - (((int64_t)(x0)*x0) << 18); const float delta = __int2float_rn(((int32_t*)&delta0)[1]) * x1; uint32_t result = (x0 << 10) + __float2int_rn(delta); @@ -56,6 +60,6 @@ __device__ __forceinline__ uint32_t fast_sqrt_v2(const uint64_t n1) const uint64_t x2 = (uint64_t)(s) * (s + b) + ((uint64_t)(result) << 32) - n1; const int32_t overshoot = ((int64_t)(x2 + b) > 0) ? -1 : 0; const int32_t undershoot = ((int64_t)(x2 + 0x100000000UL + s) < 0) ? 1 : 0; - result += (overshoot+undershoot); + result += (overshoot + undershoot); return result; } diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_groestl.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_groestl.hpp index d5a98b7da..3bec5b1a2 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_groestl.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_groestl.hpp @@ -4,173 +4,142 @@ #define GROESTL_LENGTHFIELDLEN GROESTL_ROWS #define GROESTL_COLS512 8 -#define GROESTL_SIZE512 (GROESTL_ROWS*GROESTL_COLS512) +#define GROESTL_SIZE512 (GROESTL_ROWS * GROESTL_COLS512) #define GROESTL_ROUNDS512 10 #define GROESTL_HASH_BIT_LEN 256 #define GROESTL_ROTL32(v, n) ROTL32(v, n) - #define li_32(h) 0x##h##u -#define GROESTL_EXT_BYTE(var,n) ((uint8_t)((uint32_t)(var) >> (8*n))) +#define GROESTL_EXT_BYTE(var, n) ((uint8_t)((uint32_t)(var) >> (8 * n))) -#define u32BIG(a) \ - ((GROESTL_ROTL32(a,8) & li_32(00FF00FF)) | (GROESTL_ROTL32(a,24) & li_32(FF00FF00))) +#define u32BIG(a) \ + ((GROESTL_ROTL32(a, 8) & li_32(00FF00FF)) | (GROESTL_ROTL32(a, 24) & li_32(FF00FF00))) -typedef struct { - uint32_t chaining[GROESTL_SIZE512/sizeof(uint32_t)]; /* actual state */ +typedef struct +{ + uint32_t chaining[GROESTL_SIZE512 / sizeof(uint32_t)]; /* actual state */ uint32_t block_counter1, - block_counter2; /* message block counter(s) */ - BitSequence buffer[GROESTL_SIZE512]; /* data buffer */ - int buf_ptr; /* data buffer pointer */ - int bits_in_last_byte; /* no. of message bits in last byte of data buffer */ + block_counter2; /* message block counter(s) */ + BitSequence buffer[GROESTL_SIZE512]; /* data buffer */ + int buf_ptr; /* data buffer pointer */ + int bits_in_last_byte; /* no. of message bits in last byte of data buffer */ } groestlHashState; - __constant__ uint32_t d_groestl_T[512] = -{ - 0xa5f432c6, 0xc6a597f4, 0x84976ff8, 0xf884eb97, 0x99b05eee, 0xee99c7b0, 0x8d8c7af6, 0xf68df78c, 0xd17e8ff, 0xff0de517, 0xbddc0ad6, 0xd6bdb7dc, 0xb1c816de, 0xdeb1a7c8, 0x54fc6d91, 0x915439fc -, 0x50f09060, 0x6050c0f0, 0x3050702, 0x2030405, 0xa9e02ece, 0xcea987e0, 0x7d87d156, 0x567dac87, 0x192bcce7, 0xe719d52b, 0x62a613b5, 0xb56271a6, 0xe6317c4d, 0x4de69a31, 0x9ab559ec, 0xec9ac3b5 -, 0x45cf408f, 0x8f4505cf, 0x9dbca31f, 0x1f9d3ebc, 0x40c04989, 0x894009c0, 0x879268fa, 0xfa87ef92, 0x153fd0ef, 0xef15c53f, 0xeb2694b2, 0xb2eb7f26, 0xc940ce8e, 0x8ec90740, 0xb1de6fb, 0xfb0bed1d -, 0xec2f6e41, 0x41ec822f, 0x67a91ab3, 0xb3677da9, 0xfd1c435f, 0x5ffdbe1c, 0xea256045, 0x45ea8a25, 0xbfdaf923, 0x23bf46da, 0xf7025153, 0x53f7a602, 0x96a145e4, 0xe496d3a1, 0x5bed769b, 0x9b5b2ded -, 0xc25d2875, 0x75c2ea5d, 0x1c24c5e1, 0xe11cd924, 0xaee9d43d, 0x3dae7ae9, 0x6abef24c, 0x4c6a98be, 0x5aee826c, 0x6c5ad8ee, 0x41c3bd7e, 0x7e41fcc3, 0x206f3f5, 0xf502f106, 0x4fd15283, 0x834f1dd1 -, 0x5ce48c68, 0x685cd0e4, 0xf4075651, 0x51f4a207, 0x345c8dd1, 0xd134b95c, 0x818e1f9, 0xf908e918, 0x93ae4ce2, 0xe293dfae, 0x73953eab, 0xab734d95, 0x53f59762, 0x6253c4f5, 0x3f416b2a, 0x2a3f5441 -, 0xc141c08, 0x80c1014, 0x52f66395, 0x955231f6, 0x65afe946, 0x46658caf, 0x5ee27f9d, 0x9d5e21e2, 0x28784830, 0x30286078, 0xa1f8cf37, 0x37a16ef8, 0xf111b0a, 0xa0f1411, 0xb5c4eb2f, 0x2fb55ec4 -, 0x91b150e, 0xe091c1b, 0x365a7e24, 0x2436485a, 0x9bb6ad1b, 0x1b9b36b6, 0x3d4798df, 0xdf3da547, 0x266aa7cd, 0xcd26816a, 0x69bbf54e, 0x4e699cbb, 0xcd4c337f, 0x7fcdfe4c, 0x9fba50ea, 0xea9fcfba -, 0x1b2d3f12, 0x121b242d, 0x9eb9a41d, 0x1d9e3ab9, 0x749cc458, 0x5874b09c, 0x2e724634, 0x342e6872, 0x2d774136, 0x362d6c77, 0xb2cd11dc, 0xdcb2a3cd, 0xee299db4, 0xb4ee7329, 0xfb164d5b, 0x5bfbb616 -, 0xf601a5a4, 0xa4f65301, 0x4dd7a176, 0x764decd7, 0x61a314b7, 0xb76175a3, 0xce49347d, 0x7dcefa49, 0x7b8ddf52, 0x527ba48d, 0x3e429fdd, 0xdd3ea142, 0x7193cd5e, 0x5e71bc93, 0x97a2b113, 0x139726a2 -, 0xf504a2a6, 0xa6f55704, 0x68b801b9, 0xb96869b8, 0x0, 0x0, 0x2c74b5c1, 0xc12c9974, 0x60a0e040, 0x406080a0, 0x1f21c2e3, 0xe31fdd21, 0xc8433a79, 0x79c8f243, 0xed2c9ab6, 0xb6ed772c -, 0xbed90dd4, 0xd4beb3d9, 0x46ca478d, 0x8d4601ca, 0xd9701767, 0x67d9ce70, 0x4bddaf72, 0x724be4dd, 0xde79ed94, 0x94de3379, 0xd467ff98, 0x98d42b67, 0xe82393b0, 0xb0e87b23, 0x4ade5b85, 0x854a11de -, 0x6bbd06bb, 0xbb6b6dbd, 0x2a7ebbc5, 0xc52a917e, 0xe5347b4f, 0x4fe59e34, 0x163ad7ed, 0xed16c13a, 0xc554d286, 0x86c51754, 0xd762f89a, 0x9ad72f62, 0x55ff9966, 0x6655ccff, 0x94a7b611, 0x119422a7 -, 0xcf4ac08a, 0x8acf0f4a, 0x1030d9e9, 0xe910c930, 0x60a0e04, 0x406080a, 0x819866fe, 0xfe81e798, 0xf00baba0, 0xa0f05b0b, 0x44ccb478, 0x7844f0cc, 0xbad5f025, 0x25ba4ad5, 0xe33e754b, 0x4be3963e -, 0xf30eaca2, 0xa2f35f0e, 0xfe19445d, 0x5dfeba19, 0xc05bdb80, 0x80c01b5b, 0x8a858005, 0x58a0a85, 0xadecd33f, 0x3fad7eec, 0xbcdffe21, 0x21bc42df, 0x48d8a870, 0x7048e0d8, 0x40cfdf1, 0xf104f90c -, 0xdf7a1963, 0x63dfc67a, 0xc1582f77, 0x77c1ee58, 0x759f30af, 0xaf75459f, 0x63a5e742, 0x426384a5, 0x30507020, 0x20304050, 0x1a2ecbe5, 0xe51ad12e, 0xe12effd, 0xfd0ee112, 0x6db708bf, 0xbf6d65b7 -, 0x4cd45581, 0x814c19d4, 0x143c2418, 0x1814303c, 0x355f7926, 0x26354c5f, 0x2f71b2c3, 0xc32f9d71, 0xe13886be, 0xbee16738, 0xa2fdc835, 0x35a26afd, 0xcc4fc788, 0x88cc0b4f, 0x394b652e, 0x2e395c4b -, 0x57f96a93, 0x93573df9, 0xf20d5855, 0x55f2aa0d, 0x829d61fc, 0xfc82e39d, 0x47c9b37a, 0x7a47f4c9, 0xacef27c8, 0xc8ac8bef, 0xe73288ba, 0xbae76f32, 0x2b7d4f32, 0x322b647d, 0x95a442e6, 0xe695d7a4 -, 0xa0fb3bc0, 0xc0a09bfb, 0x98b3aa19, 0x199832b3, 0xd168f69e, 0x9ed12768, 0x7f8122a3, 0xa37f5d81, 0x66aaee44, 0x446688aa, 0x7e82d654, 0x547ea882, 0xabe6dd3b, 0x3bab76e6, 0x839e950b, 0xb83169e -, 0xca45c98c, 0x8cca0345, 0x297bbcc7, 0xc729957b, 0xd36e056b, 0x6bd3d66e, 0x3c446c28, 0x283c5044, 0x798b2ca7, 0xa779558b, 0xe23d81bc, 0xbce2633d, 0x1d273116, 0x161d2c27, 0x769a37ad, 0xad76419a -, 0x3b4d96db, 0xdb3bad4d, 0x56fa9e64, 0x6456c8fa, 0x4ed2a674, 0x744ee8d2, 0x1e223614, 0x141e2822, 0xdb76e492, 0x92db3f76, 0xa1e120c, 0xc0a181e, 0x6cb4fc48, 0x486c90b4, 0xe4378fb8, 0xb8e46b37 -, 0x5de7789f, 0x9f5d25e7, 0x6eb20fbd, 0xbd6e61b2, 0xef2a6943, 0x43ef862a, 0xa6f135c4, 0xc4a693f1, 0xa8e3da39, 0x39a872e3, 0xa4f7c631, 0x31a462f7, 0x37598ad3, 0xd337bd59, 0x8b8674f2, 0xf28bff86 -, 0x325683d5, 0xd532b156, 0x43c54e8b, 0x8b430dc5, 0x59eb856e, 0x6e59dceb, 0xb7c218da, 0xdab7afc2, 0x8c8f8e01, 0x18c028f, 0x64ac1db1, 0xb16479ac, 0xd26df19c, 0x9cd2236d, 0xe03b7249, 0x49e0923b -, 0xb4c71fd8, 0xd8b4abc7, 0xfa15b9ac, 0xacfa4315, 0x709faf3, 0xf307fd09, 0x256fa0cf, 0xcf25856f, 0xafea20ca, 0xcaaf8fea, 0x8e897df4, 0xf48ef389, 0xe9206747, 0x47e98e20, 0x18283810, 0x10182028 -, 0xd5640b6f, 0x6fd5de64, 0x888373f0, 0xf088fb83, 0x6fb1fb4a, 0x4a6f94b1, 0x7296ca5c, 0x5c72b896, 0x246c5438, 0x3824706c, 0xf1085f57, 0x57f1ae08, 0xc7522173, 0x73c7e652, 0x51f36497, 0x975135f3 -, 0x2365aecb, 0xcb238d65, 0x7c8425a1, 0xa17c5984, 0x9cbf57e8, 0xe89ccbbf, 0x21635d3e, 0x3e217c63, 0xdd7cea96, 0x96dd377c, 0xdc7f1e61, 0x61dcc27f, 0x86919c0d, 0xd861a91, 0x85949b0f, 0xf851e94 -, 0x90ab4be0, 0xe090dbab, 0x42c6ba7c, 0x7c42f8c6, 0xc4572671, 0x71c4e257, 0xaae529cc, 0xccaa83e5, 0xd873e390, 0x90d83b73, 0x50f0906, 0x6050c0f, 0x103f4f7, 0xf701f503, 0x12362a1c, 0x1c123836 -, 0xa3fe3cc2, 0xc2a39ffe, 0x5fe18b6a, 0x6a5fd4e1, 0xf910beae, 0xaef94710, 0xd06b0269, 0x69d0d26b, 0x91a8bf17, 0x17912ea8, 0x58e87199, 0x995829e8, 0x2769533a, 0x3a277469, 0xb9d0f727, 0x27b94ed0 -, 0x384891d9, 0xd938a948, 0x1335deeb, 0xeb13cd35, 0xb3cee52b, 0x2bb356ce, 0x33557722, 0x22334455, 0xbbd604d2, 0xd2bbbfd6, 0x709039a9, 0xa9704990, 0x89808707, 0x7890e80, 0xa7f2c133, 0x33a766f2 -, 0xb6c1ec2d, 0x2db65ac1, 0x22665a3c, 0x3c227866, 0x92adb815, 0x15922aad, 0x2060a9c9, 0xc9208960, 0x49db5c87, 0x874915db, 0xff1ab0aa, 0xaaff4f1a, 0x7888d850, 0x5078a088, 0x7a8e2ba5, 0xa57a518e -, 0x8f8a8903, 0x38f068a, 0xf8134a59, 0x59f8b213, 0x809b9209, 0x980129b, 0x1739231a, 0x1a173439, 0xda751065, 0x65daca75, 0x315384d7, 0xd731b553, 0xc651d584, 0x84c61351, 0xb8d303d0, 0xd0b8bbd3 -, 0xc35edc82, 0x82c31f5e, 0xb0cbe229, 0x29b052cb, 0x7799c35a, 0x5a77b499, 0x11332d1e, 0x1e113c33, 0xcb463d7b, 0x7bcbf646, 0xfc1fb7a8, 0xa8fc4b1f, 0xd6610c6d, 0x6dd6da61, 0x3a4e622c, 0x2c3a584e -}; - -#define GROESTL_ROTATE_COLUMN_DOWN(v1, v2, amount_bytes, temp_var) \ - { temp_var = (v1<<(8*amount_bytes))|(v2>>(8*(4-amount_bytes))); \ - v2 = (v2<<(8*amount_bytes))|(v1>>(8*(4-amount_bytes))); \ - v1 = temp_var; } - -#define GROESTL_COLUMN(x,y,i,c0,c1,c2,c3,c4,c5,c6,c7,tv1,tv2,tu,tl,t) \ - tu = d_groestl_T[2*(uint32_t)x[4*c0+0]]; \ - tl = d_groestl_T[2*(uint32_t)x[4*c0+0]+1]; \ - tv1 = d_groestl_T[2*(uint32_t)x[4*c1+1]]; \ - tv2 = d_groestl_T[2*(uint32_t)x[4*c1+1]+1]; \ - GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,1,t) \ - tu ^= tv1; \ - tl ^= tv2; \ - tv1 = d_groestl_T[2*(uint32_t)x[4*c2+2]]; \ - tv2 = d_groestl_T[2*(uint32_t)x[4*c2+2]+1]; \ - GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,2,t) \ - tu ^= tv1; \ - tl ^= tv2; \ - tv1 = d_groestl_T[2*(uint32_t)x[4*c3+3]]; \ - tv2 = d_groestl_T[2*(uint32_t)x[4*c3+3]+1]; \ - GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,3,t) \ - tu ^= tv1; \ - tl ^= tv2; \ - tl ^= d_groestl_T[2*(uint32_t)x[4*c4+0]]; \ - tu ^= d_groestl_T[2*(uint32_t)x[4*c4+0]+1]; \ - tv1 = d_groestl_T[2*(uint32_t)x[4*c5+1]]; \ - tv2 = d_groestl_T[2*(uint32_t)x[4*c5+1]+1]; \ - GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,1,t) \ - tl ^= tv1; \ - tu ^= tv2; \ - tv1 = d_groestl_T[2*(uint32_t)x[4*c6+2]]; \ - tv2 = d_groestl_T[2*(uint32_t)x[4*c6+2]+1]; \ - GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,2,t) \ - tl ^= tv1; \ - tu ^= tv2; \ - tv1 = d_groestl_T[2*(uint32_t)x[4*c7+3]]; \ - tv2 = d_groestl_T[2*(uint32_t)x[4*c7+3]+1]; \ - GROESTL_ROTATE_COLUMN_DOWN(tv1,tv2,3,t) \ - tl ^= tv1; \ - tu ^= tv2; \ - y[i] = tu; \ - y[i+1] = tl; - -__device__ void cn_groestl_RND512P(uint8_t * __restrict__ x, uint32_t * __restrict__ y, uint32_t r) + { + 0xa5f432c6, 0xc6a597f4, 0x84976ff8, 0xf884eb97, 0x99b05eee, 0xee99c7b0, 0x8d8c7af6, 0xf68df78c, 0xd17e8ff, 0xff0de517, 0xbddc0ad6, 0xd6bdb7dc, 0xb1c816de, 0xdeb1a7c8, 0x54fc6d91, 0x915439fc, 0x50f09060, 0x6050c0f0, 0x3050702, 0x2030405, 0xa9e02ece, 0xcea987e0, 0x7d87d156, 0x567dac87, 0x192bcce7, 0xe719d52b, 0x62a613b5, 0xb56271a6, 0xe6317c4d, 0x4de69a31, 0x9ab559ec, 0xec9ac3b5, 0x45cf408f, 0x8f4505cf, 0x9dbca31f, 0x1f9d3ebc, 0x40c04989, 0x894009c0, 0x879268fa, 0xfa87ef92, 0x153fd0ef, 0xef15c53f, 0xeb2694b2, 0xb2eb7f26, 0xc940ce8e, 0x8ec90740, 0xb1de6fb, 0xfb0bed1d, 0xec2f6e41, 0x41ec822f, 0x67a91ab3, 0xb3677da9, 0xfd1c435f, 0x5ffdbe1c, 0xea256045, 0x45ea8a25, 0xbfdaf923, 0x23bf46da, 0xf7025153, 0x53f7a602, 0x96a145e4, 0xe496d3a1, 0x5bed769b, 0x9b5b2ded, 0xc25d2875, 0x75c2ea5d, 0x1c24c5e1, 0xe11cd924, 0xaee9d43d, 0x3dae7ae9, 0x6abef24c, 0x4c6a98be, 0x5aee826c, 0x6c5ad8ee, 0x41c3bd7e, 0x7e41fcc3, 0x206f3f5, 0xf502f106, 0x4fd15283, 0x834f1dd1, 0x5ce48c68, 0x685cd0e4, 0xf4075651, 0x51f4a207, 0x345c8dd1, 0xd134b95c, 0x818e1f9, 0xf908e918, 0x93ae4ce2, 0xe293dfae, 0x73953eab, 0xab734d95, 0x53f59762, 0x6253c4f5, 0x3f416b2a, 0x2a3f5441, 0xc141c08, 0x80c1014, 0x52f66395, 0x955231f6, 0x65afe946, 0x46658caf, 0x5ee27f9d, 0x9d5e21e2, 0x28784830, 0x30286078, 0xa1f8cf37, 0x37a16ef8, 0xf111b0a, 0xa0f1411, 0xb5c4eb2f, 0x2fb55ec4, 0x91b150e, 0xe091c1b, 0x365a7e24, 0x2436485a, 0x9bb6ad1b, 0x1b9b36b6, 0x3d4798df, 0xdf3da547, 0x266aa7cd, 0xcd26816a, 0x69bbf54e, 0x4e699cbb, 0xcd4c337f, 0x7fcdfe4c, 0x9fba50ea, 0xea9fcfba, 0x1b2d3f12, 0x121b242d, 0x9eb9a41d, 0x1d9e3ab9, 0x749cc458, 0x5874b09c, 0x2e724634, 0x342e6872, 0x2d774136, 0x362d6c77, 0xb2cd11dc, 0xdcb2a3cd, 0xee299db4, 0xb4ee7329, 0xfb164d5b, 0x5bfbb616, 0xf601a5a4, 0xa4f65301, 0x4dd7a176, 0x764decd7, 0x61a314b7, 0xb76175a3, 0xce49347d, 0x7dcefa49, 0x7b8ddf52, 0x527ba48d, 0x3e429fdd, 0xdd3ea142, 0x7193cd5e, 0x5e71bc93, 0x97a2b113, 0x139726a2, 0xf504a2a6, 0xa6f55704, 0x68b801b9, 0xb96869b8, 0x0, 0x0, 0x2c74b5c1, 0xc12c9974, 0x60a0e040, 0x406080a0, 0x1f21c2e3, 0xe31fdd21, 0xc8433a79, 0x79c8f243, 0xed2c9ab6, 0xb6ed772c, 0xbed90dd4, 0xd4beb3d9, 0x46ca478d, 0x8d4601ca, 0xd9701767, 0x67d9ce70, 0x4bddaf72, 0x724be4dd, 0xde79ed94, 0x94de3379, 0xd467ff98, 0x98d42b67, 0xe82393b0, 0xb0e87b23, 0x4ade5b85, 0x854a11de, 0x6bbd06bb, 0xbb6b6dbd, 0x2a7ebbc5, 0xc52a917e, 0xe5347b4f, 0x4fe59e34, 0x163ad7ed, 0xed16c13a, 0xc554d286, 0x86c51754, 0xd762f89a, 0x9ad72f62, 0x55ff9966, 0x6655ccff, 0x94a7b611, 0x119422a7, 0xcf4ac08a, 0x8acf0f4a, 0x1030d9e9, 0xe910c930, 0x60a0e04, 0x406080a, 0x819866fe, 0xfe81e798, 0xf00baba0, 0xa0f05b0b, 0x44ccb478, 0x7844f0cc, 0xbad5f025, 0x25ba4ad5, 0xe33e754b, 0x4be3963e, 0xf30eaca2, 0xa2f35f0e, 0xfe19445d, 0x5dfeba19, 0xc05bdb80, 0x80c01b5b, 0x8a858005, 0x58a0a85, 0xadecd33f, 0x3fad7eec, 0xbcdffe21, 0x21bc42df, 0x48d8a870, 0x7048e0d8, 0x40cfdf1, 0xf104f90c, 0xdf7a1963, 0x63dfc67a, 0xc1582f77, 0x77c1ee58, 0x759f30af, 0xaf75459f, 0x63a5e742, 0x426384a5, 0x30507020, 0x20304050, 0x1a2ecbe5, 0xe51ad12e, 0xe12effd, 0xfd0ee112, 0x6db708bf, 0xbf6d65b7, 0x4cd45581, 0x814c19d4, 0x143c2418, 0x1814303c, 0x355f7926, 0x26354c5f, 0x2f71b2c3, 0xc32f9d71, 0xe13886be, 0xbee16738, 0xa2fdc835, 0x35a26afd, 0xcc4fc788, 0x88cc0b4f, 0x394b652e, 0x2e395c4b, 0x57f96a93, 0x93573df9, 0xf20d5855, 0x55f2aa0d, 0x829d61fc, 0xfc82e39d, 0x47c9b37a, 0x7a47f4c9, 0xacef27c8, 0xc8ac8bef, 0xe73288ba, 0xbae76f32, 0x2b7d4f32, 0x322b647d, 0x95a442e6, 0xe695d7a4, 0xa0fb3bc0, 0xc0a09bfb, 0x98b3aa19, 0x199832b3, 0xd168f69e, 0x9ed12768, 0x7f8122a3, 0xa37f5d81, 0x66aaee44, 0x446688aa, 0x7e82d654, 0x547ea882, 0xabe6dd3b, 0x3bab76e6, 0x839e950b, 0xb83169e, 0xca45c98c, 0x8cca0345, 0x297bbcc7, 0xc729957b, 0xd36e056b, 0x6bd3d66e, 0x3c446c28, 0x283c5044, 0x798b2ca7, 0xa779558b, 0xe23d81bc, 0xbce2633d, 0x1d273116, 0x161d2c27, 0x769a37ad, 0xad76419a, 0x3b4d96db, 0xdb3bad4d, 0x56fa9e64, 0x6456c8fa, 0x4ed2a674, 0x744ee8d2, 0x1e223614, 0x141e2822, 0xdb76e492, 0x92db3f76, 0xa1e120c, 0xc0a181e, 0x6cb4fc48, 0x486c90b4, 0xe4378fb8, 0xb8e46b37, 0x5de7789f, 0x9f5d25e7, 0x6eb20fbd, 0xbd6e61b2, 0xef2a6943, 0x43ef862a, 0xa6f135c4, 0xc4a693f1, 0xa8e3da39, 0x39a872e3, 0xa4f7c631, 0x31a462f7, 0x37598ad3, 0xd337bd59, 0x8b8674f2, 0xf28bff86, 0x325683d5, 0xd532b156, 0x43c54e8b, 0x8b430dc5, 0x59eb856e, 0x6e59dceb, 0xb7c218da, 0xdab7afc2, 0x8c8f8e01, 0x18c028f, 0x64ac1db1, 0xb16479ac, 0xd26df19c, 0x9cd2236d, 0xe03b7249, 0x49e0923b, 0xb4c71fd8, 0xd8b4abc7, 0xfa15b9ac, 0xacfa4315, 0x709faf3, 0xf307fd09, 0x256fa0cf, 0xcf25856f, 0xafea20ca, 0xcaaf8fea, 0x8e897df4, 0xf48ef389, 0xe9206747, 0x47e98e20, 0x18283810, 0x10182028, 0xd5640b6f, 0x6fd5de64, 0x888373f0, 0xf088fb83, 0x6fb1fb4a, 0x4a6f94b1, 0x7296ca5c, 0x5c72b896, 0x246c5438, 0x3824706c, 0xf1085f57, 0x57f1ae08, 0xc7522173, 0x73c7e652, 0x51f36497, 0x975135f3, 0x2365aecb, 0xcb238d65, 0x7c8425a1, 0xa17c5984, 0x9cbf57e8, 0xe89ccbbf, 0x21635d3e, 0x3e217c63, 0xdd7cea96, 0x96dd377c, 0xdc7f1e61, 0x61dcc27f, 0x86919c0d, 0xd861a91, 0x85949b0f, 0xf851e94, 0x90ab4be0, 0xe090dbab, 0x42c6ba7c, 0x7c42f8c6, 0xc4572671, 0x71c4e257, 0xaae529cc, 0xccaa83e5, 0xd873e390, 0x90d83b73, 0x50f0906, 0x6050c0f, 0x103f4f7, 0xf701f503, 0x12362a1c, 0x1c123836, 0xa3fe3cc2, 0xc2a39ffe, 0x5fe18b6a, 0x6a5fd4e1, 0xf910beae, 0xaef94710, 0xd06b0269, 0x69d0d26b, 0x91a8bf17, 0x17912ea8, 0x58e87199, 0x995829e8, 0x2769533a, 0x3a277469, 0xb9d0f727, 0x27b94ed0, 0x384891d9, 0xd938a948, 0x1335deeb, 0xeb13cd35, 0xb3cee52b, 0x2bb356ce, 0x33557722, 0x22334455, 0xbbd604d2, 0xd2bbbfd6, 0x709039a9, 0xa9704990, 0x89808707, 0x7890e80, 0xa7f2c133, 0x33a766f2, 0xb6c1ec2d, 0x2db65ac1, 0x22665a3c, 0x3c227866, 0x92adb815, 0x15922aad, 0x2060a9c9, 0xc9208960, 0x49db5c87, 0x874915db, 0xff1ab0aa, 0xaaff4f1a, 0x7888d850, 0x5078a088, 0x7a8e2ba5, 0xa57a518e, 0x8f8a8903, 0x38f068a, 0xf8134a59, 0x59f8b213, 0x809b9209, 0x980129b, 0x1739231a, 0x1a173439, 0xda751065, 0x65daca75, 0x315384d7, 0xd731b553, 0xc651d584, 0x84c61351, 0xb8d303d0, 0xd0b8bbd3, 0xc35edc82, 0x82c31f5e, 0xb0cbe229, 0x29b052cb, 0x7799c35a, 0x5a77b499, 0x11332d1e, 0x1e113c33, 0xcb463d7b, 0x7bcbf646, 0xfc1fb7a8, 0xa8fc4b1f, 0xd6610c6d, 0x6dd6da61, 0x3a4e622c, 0x2c3a584e}; + +#define GROESTL_ROTATE_COLUMN_DOWN(v1, v2, amount_bytes, temp_var) \ + { \ + temp_var = (v1 << (8 * amount_bytes)) | (v2 >> (8 * (4 - amount_bytes))); \ + v2 = (v2 << (8 * amount_bytes)) | (v1 >> (8 * (4 - amount_bytes))); \ + v1 = temp_var; \ + } + +#define GROESTL_COLUMN(x, y, i, c0, c1, c2, c3, c4, c5, c6, c7, tv1, tv2, tu, tl, t) \ + tu = d_groestl_T[2 * (uint32_t)x[4 * c0 + 0]]; \ + tl = d_groestl_T[2 * (uint32_t)x[4 * c0 + 0] + 1]; \ + tv1 = d_groestl_T[2 * (uint32_t)x[4 * c1 + 1]]; \ + tv2 = d_groestl_T[2 * (uint32_t)x[4 * c1 + 1] + 1]; \ + GROESTL_ROTATE_COLUMN_DOWN(tv1, tv2, 1, t) \ + tu ^= tv1; \ + tl ^= tv2; \ + tv1 = d_groestl_T[2 * (uint32_t)x[4 * c2 + 2]]; \ + tv2 = d_groestl_T[2 * (uint32_t)x[4 * c2 + 2] + 1]; \ + GROESTL_ROTATE_COLUMN_DOWN(tv1, tv2, 2, t) \ + tu ^= tv1; \ + tl ^= tv2; \ + tv1 = d_groestl_T[2 * (uint32_t)x[4 * c3 + 3]]; \ + tv2 = d_groestl_T[2 * (uint32_t)x[4 * c3 + 3] + 1]; \ + GROESTL_ROTATE_COLUMN_DOWN(tv1, tv2, 3, t) \ + tu ^= tv1; \ + tl ^= tv2; \ + tl ^= d_groestl_T[2 * (uint32_t)x[4 * c4 + 0]]; \ + tu ^= d_groestl_T[2 * (uint32_t)x[4 * c4 + 0] + 1]; \ + tv1 = d_groestl_T[2 * (uint32_t)x[4 * c5 + 1]]; \ + tv2 = d_groestl_T[2 * (uint32_t)x[4 * c5 + 1] + 1]; \ + GROESTL_ROTATE_COLUMN_DOWN(tv1, tv2, 1, t) \ + tl ^= tv1; \ + tu ^= tv2; \ + tv1 = d_groestl_T[2 * (uint32_t)x[4 * c6 + 2]]; \ + tv2 = d_groestl_T[2 * (uint32_t)x[4 * c6 + 2] + 1]; \ + GROESTL_ROTATE_COLUMN_DOWN(tv1, tv2, 2, t) \ + tl ^= tv1; \ + tu ^= tv2; \ + tv1 = d_groestl_T[2 * (uint32_t)x[4 * c7 + 3]]; \ + tv2 = d_groestl_T[2 * (uint32_t)x[4 * c7 + 3] + 1]; \ + GROESTL_ROTATE_COLUMN_DOWN(tv1, tv2, 3, t) \ + tl ^= tv1; \ + tu ^= tv2; \ + y[i] = tu; \ + y[i + 1] = tl; + +__device__ void cn_groestl_RND512P(uint8_t* __restrict__ x, uint32_t* __restrict__ y, uint32_t r) { uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp; uint32_t* x32 = (uint32_t*)x; - x32[ 0] ^= 0x00000000^r; - x32[ 2] ^= 0x00000010^r; - x32[ 4] ^= 0x00000020^r; - x32[ 6] ^= 0x00000030^r; - x32[ 8] ^= 0x00000040^r; - x32[10] ^= 0x00000050^r; - x32[12] ^= 0x00000060^r; - x32[14] ^= 0x00000070^r; - GROESTL_COLUMN(x,y, 0, 0, 2, 4, 6, 9, 11, 13, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - GROESTL_COLUMN(x,y, 2, 2, 4, 6, 8, 11, 13, 15, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - GROESTL_COLUMN(x,y, 4, 4, 6, 8, 10, 13, 15, 1, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - GROESTL_COLUMN(x,y, 6, 6, 8, 10, 12, 15, 1, 3, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - GROESTL_COLUMN(x,y, 8, 8, 10, 12, 14, 1, 3, 5, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - GROESTL_COLUMN(x,y,10, 10, 12, 14, 0, 3, 5, 7, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - GROESTL_COLUMN(x,y,12, 12, 14, 0, 2, 5, 7, 9, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - GROESTL_COLUMN(x,y,14, 14, 0, 2, 4, 7, 9, 11, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + x32[0] ^= 0x00000000 ^ r; + x32[2] ^= 0x00000010 ^ r; + x32[4] ^= 0x00000020 ^ r; + x32[6] ^= 0x00000030 ^ r; + x32[8] ^= 0x00000040 ^ r; + x32[10] ^= 0x00000050 ^ r; + x32[12] ^= 0x00000060 ^ r; + x32[14] ^= 0x00000070 ^ r; + GROESTL_COLUMN(x, y, 0, 0, 2, 4, 6, 9, 11, 13, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x, y, 2, 2, 4, 6, 8, 11, 13, 15, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x, y, 4, 4, 6, 8, 10, 13, 15, 1, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x, y, 6, 6, 8, 10, 12, 15, 1, 3, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x, y, 8, 8, 10, 12, 14, 1, 3, 5, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x, y, 10, 10, 12, 14, 0, 3, 5, 7, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x, y, 12, 12, 14, 0, 2, 5, 7, 9, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x, y, 14, 14, 0, 2, 4, 7, 9, 11, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); } -__device__ void cn_groestl_RND512Q(uint8_t * __restrict__ x, uint32_t * __restrict__ y, uint32_t r) +__device__ void cn_groestl_RND512Q(uint8_t* __restrict__ x, uint32_t* __restrict__ y, uint32_t r) { uint32_t temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp; uint32_t* x32 = (uint32_t*)x; - x32[ 0] = ~x32[ 0]; - x32[ 1] ^= 0xffffffff^r; - x32[ 2] = ~x32[ 2]; - x32[ 3] ^= 0xefffffff^r; - x32[ 4] = ~x32[ 4]; - x32[ 5] ^= 0xdfffffff^r; - x32[ 6] = ~x32[ 6]; - x32[ 7] ^= 0xcfffffff^r; - x32[ 8] = ~x32[ 8]; - x32[ 9] ^= 0xbfffffff^r; + x32[0] = ~x32[0]; + x32[1] ^= 0xffffffff ^ r; + x32[2] = ~x32[2]; + x32[3] ^= 0xefffffff ^ r; + x32[4] = ~x32[4]; + x32[5] ^= 0xdfffffff ^ r; + x32[6] = ~x32[6]; + x32[7] ^= 0xcfffffff ^ r; + x32[8] = ~x32[8]; + x32[9] ^= 0xbfffffff ^ r; x32[10] = ~x32[10]; - x32[11] ^= 0xafffffff^r; + x32[11] ^= 0xafffffff ^ r; x32[12] = ~x32[12]; - x32[13] ^= 0x9fffffff^r; + x32[13] ^= 0x9fffffff ^ r; x32[14] = ~x32[14]; - x32[15] ^= 0x8fffffff^r; - GROESTL_COLUMN(x,y, 0, 2, 6, 10, 14, 1, 5, 9, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - GROESTL_COLUMN(x,y, 2, 4, 8, 12, 0, 3, 7, 11, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - GROESTL_COLUMN(x,y, 4, 6, 10, 14, 2, 5, 9, 13, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - GROESTL_COLUMN(x,y, 6, 8, 12, 0, 4, 7, 11, 15, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - GROESTL_COLUMN(x,y, 8, 10, 14, 2, 6, 9, 13, 1, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - GROESTL_COLUMN(x,y,10, 12, 0, 4, 8, 11, 15, 3, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - GROESTL_COLUMN(x,y,12, 14, 2, 6, 10, 13, 1, 5, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); - GROESTL_COLUMN(x,y,14, 0, 4, 8, 12, 15, 3, 7, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + x32[15] ^= 0x8fffffff ^ r; + GROESTL_COLUMN(x, y, 0, 2, 6, 10, 14, 1, 5, 9, 13, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x, y, 2, 4, 8, 12, 0, 3, 7, 11, 15, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x, y, 4, 6, 10, 14, 2, 5, 9, 13, 1, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x, y, 6, 8, 12, 0, 4, 7, 11, 15, 3, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x, y, 8, 10, 14, 2, 6, 9, 13, 1, 5, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x, y, 10, 12, 0, 4, 8, 11, 15, 3, 7, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x, y, 12, 14, 2, 6, 10, 13, 1, 5, 9, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); + GROESTL_COLUMN(x, y, 14, 0, 4, 8, 12, 15, 3, 7, 11, temp_v1, temp_v2, temp_upper_value, temp_lower_value, temp); } -__device__ void cn_groestl_F512(uint32_t * __restrict__ h, const uint32_t * __restrict__ m) +__device__ void cn_groestl_F512(uint32_t* __restrict__ h, const uint32_t* __restrict__ m) { int i; - uint32_t Ptmp[2*GROESTL_COLS512]; - uint32_t Qtmp[2*GROESTL_COLS512]; - uint32_t y[2*GROESTL_COLS512]; - uint32_t z[2*GROESTL_COLS512]; + uint32_t Ptmp[2 * GROESTL_COLS512]; + uint32_t Qtmp[2 * GROESTL_COLS512]; + uint32_t y[2 * GROESTL_COLS512]; + uint32_t z[2 * GROESTL_COLS512]; - for (i = 0; i < 2*GROESTL_COLS512; i++) + for(i = 0; i < 2 * GROESTL_COLS512; i++) { z[i] = m[i]; - Ptmp[i] = h[i]^m[i]; + Ptmp[i] = h[i] ^ m[i]; } cn_groestl_RND512Q((uint8_t*)z, y, 0x00000000); @@ -195,18 +164,18 @@ __device__ void cn_groestl_F512(uint32_t * __restrict__ h, const uint32_t * __re cn_groestl_RND512P((uint8_t*)z, y, 0x00000008); cn_groestl_RND512P((uint8_t*)y, Ptmp, 0x00000009); - for (i = 0; i < 2*GROESTL_COLS512; i++) - h[i] ^= Ptmp[i]^Qtmp[i]; + for(i = 0; i < 2 * GROESTL_COLS512; i++) + h[i] ^= Ptmp[i] ^ Qtmp[i]; } -__device__ void cn_groestl_outputtransformation(groestlHashState *ctx) +__device__ void cn_groestl_outputtransformation(groestlHashState* ctx) { int j; - uint32_t temp[2*GROESTL_COLS512]; - uint32_t y[2*GROESTL_COLS512]; - uint32_t z[2*GROESTL_COLS512]; + uint32_t temp[2 * GROESTL_COLS512]; + uint32_t y[2 * GROESTL_COLS512]; + uint32_t z[2 * GROESTL_COLS512]; - for (j = 0; j < 2*GROESTL_COLS512; j++) + for(j = 0; j < 2 * GROESTL_COLS512; j++) temp[j] = ctx->chaining[j]; cn_groestl_RND512P((uint8_t*)temp, y, 0x00000000); @@ -220,33 +189,33 @@ __device__ void cn_groestl_outputtransformation(groestlHashState *ctx) cn_groestl_RND512P((uint8_t*)z, y, 0x00000008); cn_groestl_RND512P((uint8_t*)y, temp, 0x00000009); - for (j = 0; j < 2*GROESTL_COLS512; j++) + for(j = 0; j < 2 * GROESTL_COLS512; j++) ctx->chaining[j] ^= temp[j]; } -__device__ void cn_groestl_transform(groestlHashState * __restrict__ ctx, - const uint8_t * __restrict__ input, int msglen) +__device__ void cn_groestl_transform(groestlHashState* __restrict__ ctx, + const uint8_t* __restrict__ input, int msglen) { - for (; msglen >= GROESTL_SIZE512; msglen -= GROESTL_SIZE512, input += GROESTL_SIZE512) + for(; msglen >= GROESTL_SIZE512; msglen -= GROESTL_SIZE512, input += GROESTL_SIZE512) { - cn_groestl_F512(ctx->chaining,(uint32_t*)input); + cn_groestl_F512(ctx->chaining, (uint32_t*)input); ctx->block_counter1++; - if (ctx->block_counter1 == 0) + if(ctx->block_counter1 == 0) ctx->block_counter2++; } } -__device__ void cn_groestl_final(groestlHashState* __restrict__ ctx, - BitSequence* __restrict__ output) +__device__ void cn_groestl_final(groestlHashState* __restrict__ ctx, + BitSequence* __restrict__ output) { - int i, j = 0, hashbytelen = GROESTL_HASH_BIT_LEN/8; - uint8_t *s = (BitSequence*)ctx->chaining; + int i, j = 0, hashbytelen = GROESTL_HASH_BIT_LEN / 8; + uint8_t* s = (BitSequence*)ctx->chaining; - if (ctx->bits_in_last_byte) + if(ctx->bits_in_last_byte) { - ctx->buffer[(int)ctx->buf_ptr-1] &= ((1<bits_in_last_byte)-1)<<(8-ctx->bits_in_last_byte); - ctx->buffer[(int)ctx->buf_ptr-1] ^= 0x1<<(7-ctx->bits_in_last_byte); + ctx->buffer[(int)ctx->buf_ptr - 1] &= ((1 << ctx->bits_in_last_byte) - 1) << (8 - ctx->bits_in_last_byte); + ctx->buffer[(int)ctx->buf_ptr - 1] ^= 0x1 << (7 - ctx->bits_in_last_byte); ctx->bits_in_last_byte = 0; } else @@ -254,29 +223,29 @@ __device__ void cn_groestl_final(groestlHashState* __restrict__ ctx, ctx->buffer[(int)ctx->buf_ptr++] = 0x80; } - if (ctx->buf_ptr > GROESTL_SIZE512-GROESTL_LENGTHFIELDLEN) + if(ctx->buf_ptr > GROESTL_SIZE512 - GROESTL_LENGTHFIELDLEN) { - while (ctx->buf_ptr < GROESTL_SIZE512) + while(ctx->buf_ptr < GROESTL_SIZE512) ctx->buffer[(int)ctx->buf_ptr++] = 0; cn_groestl_transform(ctx, ctx->buffer, GROESTL_SIZE512); ctx->buf_ptr = 0; } - while (ctx->buf_ptr < GROESTL_SIZE512-GROESTL_LENGTHFIELDLEN) + while(ctx->buf_ptr < GROESTL_SIZE512 - GROESTL_LENGTHFIELDLEN) ctx->buffer[(int)ctx->buf_ptr++] = 0; ctx->block_counter1++; - if (ctx->block_counter1 == 0) + if(ctx->block_counter1 == 0) ctx->block_counter2++; ctx->buf_ptr = GROESTL_SIZE512; - while (ctx->buf_ptr > GROESTL_SIZE512-(int)sizeof(uint32_t)) + while(ctx->buf_ptr > GROESTL_SIZE512 - (int)sizeof(uint32_t)) { ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter1; ctx->block_counter1 >>= 8; } - while (ctx->buf_ptr > GROESTL_SIZE512-GROESTL_LENGTHFIELDLEN) + while(ctx->buf_ptr > GROESTL_SIZE512 - GROESTL_LENGTHFIELDLEN) { ctx->buffer[(int)--ctx->buf_ptr] = (uint8_t)ctx->block_counter2; ctx->block_counter2 >>= 8; @@ -284,12 +253,12 @@ __device__ void cn_groestl_final(groestlHashState* __restrict__ ctx, cn_groestl_transform(ctx, ctx->buffer, GROESTL_SIZE512); cn_groestl_outputtransformation(ctx); - for (i = GROESTL_SIZE512-hashbytelen; i < GROESTL_SIZE512; i++,j++) + for(i = GROESTL_SIZE512 - hashbytelen; i < GROESTL_SIZE512; i++, j++) output[j] = s[i]; - for (i = 0; i < GROESTL_COLS512; i++) + for(i = 0; i < GROESTL_COLS512; i++) ctx->chaining[i] = 0; - for (i = 0; i < GROESTL_SIZE512; i++) + for(i = 0; i < GROESTL_SIZE512; i++) ctx->buffer[i] = 0; } @@ -297,17 +266,17 @@ __device__ void cn_groestl_update(groestlHashState* __restrict__ ctx, const BitSequence* __restrict__ input, DataLength databitlen) { int index = 0; - int msglen = (int)(databitlen/8); - int rem = (int)(databitlen%8); + int msglen = (int)(databitlen / 8); + int rem = (int)(databitlen % 8); - if (ctx->buf_ptr) + if(ctx->buf_ptr) { - while (ctx->buf_ptr < GROESTL_SIZE512 && index < msglen) + while(ctx->buf_ptr < GROESTL_SIZE512 && index < msglen) ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; - if (ctx->buf_ptr < GROESTL_SIZE512) + if(ctx->buf_ptr < GROESTL_SIZE512) { - if (rem) + if(rem) { ctx->bits_in_last_byte = rem; ctx->buffer[(int)ctx->buf_ptr++] = input[index]; @@ -319,13 +288,13 @@ __device__ void cn_groestl_update(groestlHashState* __restrict__ ctx, cn_groestl_transform(ctx, ctx->buffer, GROESTL_SIZE512); } - cn_groestl_transform(ctx, input+index, msglen-index); - index += ((msglen-index)/GROESTL_SIZE512)*GROESTL_SIZE512; + cn_groestl_transform(ctx, input + index, msglen - index); + index += ((msglen - index) / GROESTL_SIZE512) * GROESTL_SIZE512; - while (index < msglen) + while(index < msglen) ctx->buffer[(int)ctx->buf_ptr++] = input[index++]; - if (rem) + if(rem) { ctx->bits_in_last_byte = rem; ctx->buffer[(int)ctx->buf_ptr++] = input[index]; @@ -336,17 +305,17 @@ __device__ void cn_groestl_init(groestlHashState* ctx) { int i = 0; - for(;i<(GROESTL_SIZE512/sizeof(uint32_t));i++) + for(; i < (GROESTL_SIZE512 / sizeof(uint32_t)); i++) ctx->chaining[i] = 0; - ctx->chaining[2*GROESTL_COLS512-1] = u32BIG((uint32_t)GROESTL_HASH_BIT_LEN); + ctx->chaining[2 * GROESTL_COLS512 - 1] = u32BIG((uint32_t)GROESTL_HASH_BIT_LEN); ctx->buf_ptr = 0; ctx->block_counter1 = 0; ctx->block_counter2 = 0; ctx->bits_in_last_byte = 0; } -__device__ void cn_groestl(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval) +__device__ void cn_groestl(const BitSequence* __restrict__ data, DataLength len, BitSequence* __restrict__ hashval) { DataLength databitlen = len << 3; groestlHashState context; diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_jh.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_jh.hpp index 284039ff4..1019a9b9c 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_jh.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_jh.hpp @@ -1,6 +1,7 @@ #include -typedef struct { +typedef struct +{ int hashbitlen; unsigned long long databitlen; unsigned long long datasize_in_buffer; @@ -9,159 +10,175 @@ typedef struct { } jhHashState; __constant__ unsigned char d_JH256_H0[512] = -{ - 0xeb, 0x98, 0xa3, 0x41, 0x2c, 0x20, 0xd3, 0xeb, 0x92, 0xcd, 0xbe, 0x7b, 0x9c, 0xb2, 0x45, 0xc1, - 0x1c, 0x93, 0x51, 0x91, 0x60, 0xd4, 0xc7, 0xfa, 0x26, 0x0, 0x82, 0xd6, 0x7e, 0x50, 0x8a, 0x3, - 0xa4, 0x23, 0x9e, 0x26, 0x77, 0x26, 0xb9, 0x45, 0xe0, 0xfb, 0x1a, 0x48, 0xd4, 0x1a, 0x94, 0x77, - 0xcd, 0xb5, 0xab, 0x26, 0x2, 0x6b, 0x17, 0x7a, 0x56, 0xf0, 0x24, 0x42, 0xf, 0xff, 0x2f, 0xa8, - 0x71, 0xa3, 0x96, 0x89, 0x7f, 0x2e, 0x4d, 0x75, 0x1d, 0x14, 0x49, 0x8, 0xf7, 0x7d, 0xe2, 0x62, - 0x27, 0x76, 0x95, 0xf7, 0x76, 0x24, 0x8f, 0x94, 0x87, 0xd5, 0xb6, 0x57, 0x47, 0x80, 0x29, 0x6c, - 0x5c, 0x5e, 0x27, 0x2d, 0xac, 0x8e, 0xd, 0x6c, 0x51, 0x84, 0x50, 0xc6, 0x57, 0x5, 0x7a, 0xf, - 0x7b, 0xe4, 0xd3, 0x67, 0x70, 0x24, 0x12, 0xea, 0x89, 0xe3, 0xab, 0x13, 0xd3, 0x1c, 0xd7, 0x69 -}; + { + 0xeb, 0x98, 0xa3, 0x41, 0x2c, 0x20, 0xd3, 0xeb, 0x92, 0xcd, 0xbe, 0x7b, 0x9c, 0xb2, 0x45, 0xc1, + 0x1c, 0x93, 0x51, 0x91, 0x60, 0xd4, 0xc7, 0xfa, 0x26, 0x0, 0x82, 0xd6, 0x7e, 0x50, 0x8a, 0x3, + 0xa4, 0x23, 0x9e, 0x26, 0x77, 0x26, 0xb9, 0x45, 0xe0, 0xfb, 0x1a, 0x48, 0xd4, 0x1a, 0x94, 0x77, + 0xcd, 0xb5, 0xab, 0x26, 0x2, 0x6b, 0x17, 0x7a, 0x56, 0xf0, 0x24, 0x42, 0xf, 0xff, 0x2f, 0xa8, + 0x71, 0xa3, 0x96, 0x89, 0x7f, 0x2e, 0x4d, 0x75, 0x1d, 0x14, 0x49, 0x8, 0xf7, 0x7d, 0xe2, 0x62, + 0x27, 0x76, 0x95, 0xf7, 0x76, 0x24, 0x8f, 0x94, 0x87, 0xd5, 0xb6, 0x57, 0x47, 0x80, 0x29, 0x6c, + 0x5c, 0x5e, 0x27, 0x2d, 0xac, 0x8e, 0xd, 0x6c, 0x51, 0x84, 0x50, 0xc6, 0x57, 0x5, 0x7a, 0xf, + 0x7b, 0xe4, 0xd3, 0x67, 0x70, 0x24, 0x12, 0xea, 0x89, 0xe3, 0xab, 0x13, 0xd3, 0x1c, 0xd7, 0x69}; __constant__ unsigned char d_E8_rc[42][32] = -{ - {0x72, 0xd5, 0xde, 0xa2, 0xdf, 0x15, 0xf8, 0x67, 0x7b, 0x84, 0x15, 0xa, 0xb7, 0x23, 0x15, 0x57, 0x81, 0xab, 0xd6, 0x90, 0x4d, 0x5a, 0x87, 0xf6, 0x4e, 0x9f, 0x4f, 0xc5, 0xc3, 0xd1, 0x2b, 0x40}, - {0xea, 0x98, 0x3a, 0xe0, 0x5c, 0x45, 0xfa, 0x9c, 0x3, 0xc5, 0xd2, 0x99, 0x66, 0xb2, 0x99, 0x9a, 0x66, 0x2, 0x96, 0xb4, 0xf2, 0xbb, 0x53, 0x8a, 0xb5, 0x56, 0x14, 0x1a, 0x88, 0xdb, 0xa2, 0x31}, - {0x3, 0xa3, 0x5a, 0x5c, 0x9a, 0x19, 0xe, 0xdb, 0x40, 0x3f, 0xb2, 0xa, 0x87, 0xc1, 0x44, 0x10, 0x1c, 0x5, 0x19, 0x80, 0x84, 0x9e, 0x95, 0x1d, 0x6f, 0x33, 0xeb, 0xad, 0x5e, 0xe7, 0xcd, 0xdc}, - {0x10, 0xba, 0x13, 0x92, 0x2, 0xbf, 0x6b, 0x41, 0xdc, 0x78, 0x65, 0x15, 0xf7, 0xbb, 0x27, 0xd0, 0xa, 0x2c, 0x81, 0x39, 0x37, 0xaa, 0x78, 0x50, 0x3f, 0x1a, 0xbf, 0xd2, 0x41, 0x0, 0x91, 0xd3}, - {0x42, 0x2d, 0x5a, 0xd, 0xf6, 0xcc, 0x7e, 0x90, 0xdd, 0x62, 0x9f, 0x9c, 0x92, 0xc0, 0x97, 0xce, 0x18, 0x5c, 0xa7, 0xb, 0xc7, 0x2b, 0x44, 0xac, 0xd1, 0xdf, 0x65, 0xd6, 0x63, 0xc6, 0xfc, 0x23}, - {0x97, 0x6e, 0x6c, 0x3, 0x9e, 0xe0, 0xb8, 0x1a, 0x21, 0x5, 0x45, 0x7e, 0x44, 0x6c, 0xec, 0xa8, 0xee, 0xf1, 0x3, 0xbb, 0x5d, 0x8e, 0x61, 0xfa, 0xfd, 0x96, 0x97, 0xb2, 0x94, 0x83, 0x81, 0x97}, - {0x4a, 0x8e, 0x85, 0x37, 0xdb, 0x3, 0x30, 0x2f, 0x2a, 0x67, 0x8d, 0x2d, 0xfb, 0x9f, 0x6a, 0x95, 0x8a, 0xfe, 0x73, 0x81, 0xf8, 0xb8, 0x69, 0x6c, 0x8a, 0xc7, 0x72, 0x46, 0xc0, 0x7f, 0x42, 0x14}, - {0xc5, 0xf4, 0x15, 0x8f, 0xbd, 0xc7, 0x5e, 0xc4, 0x75, 0x44, 0x6f, 0xa7, 0x8f, 0x11, 0xbb, 0x80, 0x52, 0xde, 0x75, 0xb7, 0xae, 0xe4, 0x88, 0xbc, 0x82, 0xb8, 0x0, 0x1e, 0x98, 0xa6, 0xa3, 0xf4}, - {0x8e, 0xf4, 0x8f, 0x33, 0xa9, 0xa3, 0x63, 0x15, 0xaa, 0x5f, 0x56, 0x24, 0xd5, 0xb7, 0xf9, 0x89, 0xb6, 0xf1, 0xed, 0x20, 0x7c, 0x5a, 0xe0, 0xfd, 0x36, 0xca, 0xe9, 0x5a, 0x6, 0x42, 0x2c, 0x36}, - {0xce, 0x29, 0x35, 0x43, 0x4e, 0xfe, 0x98, 0x3d, 0x53, 0x3a, 0xf9, 0x74, 0x73, 0x9a, 0x4b, 0xa7, 0xd0, 0xf5, 0x1f, 0x59, 0x6f, 0x4e, 0x81, 0x86, 0xe, 0x9d, 0xad, 0x81, 0xaf, 0xd8, 0x5a, 0x9f}, - {0xa7, 0x5, 0x6, 0x67, 0xee, 0x34, 0x62, 0x6a, 0x8b, 0xb, 0x28, 0xbe, 0x6e, 0xb9, 0x17, 0x27, 0x47, 0x74, 0x7, 0x26, 0xc6, 0x80, 0x10, 0x3f, 0xe0, 0xa0, 0x7e, 0x6f, 0xc6, 0x7e, 0x48, 0x7b}, - {0xd, 0x55, 0xa, 0xa5, 0x4a, 0xf8, 0xa4, 0xc0, 0x91, 0xe3, 0xe7, 0x9f, 0x97, 0x8e, 0xf1, 0x9e, 0x86, 0x76, 0x72, 0x81, 0x50, 0x60, 0x8d, 0xd4, 0x7e, 0x9e, 0x5a, 0x41, 0xf3, 0xe5, 0xb0, 0x62}, - {0xfc, 0x9f, 0x1f, 0xec, 0x40, 0x54, 0x20, 0x7a, 0xe3, 0xe4, 0x1a, 0x0, 0xce, 0xf4, 0xc9, 0x84, 0x4f, 0xd7, 0x94, 0xf5, 0x9d, 0xfa, 0x95, 0xd8, 0x55, 0x2e, 0x7e, 0x11, 0x24, 0xc3, 0x54, 0xa5}, - {0x5b, 0xdf, 0x72, 0x28, 0xbd, 0xfe, 0x6e, 0x28, 0x78, 0xf5, 0x7f, 0xe2, 0xf, 0xa5, 0xc4, 0xb2, 0x5, 0x89, 0x7c, 0xef, 0xee, 0x49, 0xd3, 0x2e, 0x44, 0x7e, 0x93, 0x85, 0xeb, 0x28, 0x59, 0x7f}, - {0x70, 0x5f, 0x69, 0x37, 0xb3, 0x24, 0x31, 0x4a, 0x5e, 0x86, 0x28, 0xf1, 0x1d, 0xd6, 0xe4, 0x65, 0xc7, 0x1b, 0x77, 0x4, 0x51, 0xb9, 0x20, 0xe7, 0x74, 0xfe, 0x43, 0xe8, 0x23, 0xd4, 0x87, 0x8a}, - {0x7d, 0x29, 0xe8, 0xa3, 0x92, 0x76, 0x94, 0xf2, 0xdd, 0xcb, 0x7a, 0x9, 0x9b, 0x30, 0xd9, 0xc1, 0x1d, 0x1b, 0x30, 0xfb, 0x5b, 0xdc, 0x1b, 0xe0, 0xda, 0x24, 0x49, 0x4f, 0xf2, 0x9c, 0x82, 0xbf}, - {0xa4, 0xe7, 0xba, 0x31, 0xb4, 0x70, 0xbf, 0xff, 0xd, 0x32, 0x44, 0x5, 0xde, 0xf8, 0xbc, 0x48, 0x3b, 0xae, 0xfc, 0x32, 0x53, 0xbb, 0xd3, 0x39, 0x45, 0x9f, 0xc3, 0xc1, 0xe0, 0x29, 0x8b, 0xa0}, - {0xe5, 0xc9, 0x5, 0xfd, 0xf7, 0xae, 0x9, 0xf, 0x94, 0x70, 0x34, 0x12, 0x42, 0x90, 0xf1, 0x34, 0xa2, 0x71, 0xb7, 0x1, 0xe3, 0x44, 0xed, 0x95, 0xe9, 0x3b, 0x8e, 0x36, 0x4f, 0x2f, 0x98, 0x4a}, - {0x88, 0x40, 0x1d, 0x63, 0xa0, 0x6c, 0xf6, 0x15, 0x47, 0xc1, 0x44, 0x4b, 0x87, 0x52, 0xaf, 0xff, 0x7e, 0xbb, 0x4a, 0xf1, 0xe2, 0xa, 0xc6, 0x30, 0x46, 0x70, 0xb6, 0xc5, 0xcc, 0x6e, 0x8c, 0xe6}, - {0xa4, 0xd5, 0xa4, 0x56, 0xbd, 0x4f, 0xca, 0x0, 0xda, 0x9d, 0x84, 0x4b, 0xc8, 0x3e, 0x18, 0xae, 0x73, 0x57, 0xce, 0x45, 0x30, 0x64, 0xd1, 0xad, 0xe8, 0xa6, 0xce, 0x68, 0x14, 0x5c, 0x25, 0x67}, - {0xa3, 0xda, 0x8c, 0xf2, 0xcb, 0xe, 0xe1, 0x16, 0x33, 0xe9, 0x6, 0x58, 0x9a, 0x94, 0x99, 0x9a, 0x1f, 0x60, 0xb2, 0x20, 0xc2, 0x6f, 0x84, 0x7b, 0xd1, 0xce, 0xac, 0x7f, 0xa0, 0xd1, 0x85, 0x18}, - {0x32, 0x59, 0x5b, 0xa1, 0x8d, 0xdd, 0x19, 0xd3, 0x50, 0x9a, 0x1c, 0xc0, 0xaa, 0xa5, 0xb4, 0x46, 0x9f, 0x3d, 0x63, 0x67, 0xe4, 0x4, 0x6b, 0xba, 0xf6, 0xca, 0x19, 0xab, 0xb, 0x56, 0xee, 0x7e}, - {0x1f, 0xb1, 0x79, 0xea, 0xa9, 0x28, 0x21, 0x74, 0xe9, 0xbd, 0xf7, 0x35, 0x3b, 0x36, 0x51, 0xee, 0x1d, 0x57, 0xac, 0x5a, 0x75, 0x50, 0xd3, 0x76, 0x3a, 0x46, 0xc2, 0xfe, 0xa3, 0x7d, 0x70, 0x1}, - {0xf7, 0x35, 0xc1, 0xaf, 0x98, 0xa4, 0xd8, 0x42, 0x78, 0xed, 0xec, 0x20, 0x9e, 0x6b, 0x67, 0x79, 0x41, 0x83, 0x63, 0x15, 0xea, 0x3a, 0xdb, 0xa8, 0xfa, 0xc3, 0x3b, 0x4d, 0x32, 0x83, 0x2c, 0x83}, - {0xa7, 0x40, 0x3b, 0x1f, 0x1c, 0x27, 0x47, 0xf3, 0x59, 0x40, 0xf0, 0x34, 0xb7, 0x2d, 0x76, 0x9a, 0xe7, 0x3e, 0x4e, 0x6c, 0xd2, 0x21, 0x4f, 0xfd, 0xb8, 0xfd, 0x8d, 0x39, 0xdc, 0x57, 0x59, 0xef}, - {0x8d, 0x9b, 0xc, 0x49, 0x2b, 0x49, 0xeb, 0xda, 0x5b, 0xa2, 0xd7, 0x49, 0x68, 0xf3, 0x70, 0xd, 0x7d, 0x3b, 0xae, 0xd0, 0x7a, 0x8d, 0x55, 0x84, 0xf5, 0xa5, 0xe9, 0xf0, 0xe4, 0xf8, 0x8e, 0x65}, - {0xa0, 0xb8, 0xa2, 0xf4, 0x36, 0x10, 0x3b, 0x53, 0xc, 0xa8, 0x7, 0x9e, 0x75, 0x3e, 0xec, 0x5a, 0x91, 0x68, 0x94, 0x92, 0x56, 0xe8, 0x88, 0x4f, 0x5b, 0xb0, 0x5c, 0x55, 0xf8, 0xba, 0xbc, 0x4c}, - {0xe3, 0xbb, 0x3b, 0x99, 0xf3, 0x87, 0x94, 0x7b, 0x75, 0xda, 0xf4, 0xd6, 0x72, 0x6b, 0x1c, 0x5d, 0x64, 0xae, 0xac, 0x28, 0xdc, 0x34, 0xb3, 0x6d, 0x6c, 0x34, 0xa5, 0x50, 0xb8, 0x28, 0xdb, 0x71}, - {0xf8, 0x61, 0xe2, 0xf2, 0x10, 0x8d, 0x51, 0x2a, 0xe3, 0xdb, 0x64, 0x33, 0x59, 0xdd, 0x75, 0xfc, 0x1c, 0xac, 0xbc, 0xf1, 0x43, 0xce, 0x3f, 0xa2, 0x67, 0xbb, 0xd1, 0x3c, 0x2, 0xe8, 0x43, 0xb0}, - {0x33, 0xa, 0x5b, 0xca, 0x88, 0x29, 0xa1, 0x75, 0x7f, 0x34, 0x19, 0x4d, 0xb4, 0x16, 0x53, 0x5c, 0x92, 0x3b, 0x94, 0xc3, 0xe, 0x79, 0x4d, 0x1e, 0x79, 0x74, 0x75, 0xd7, 0xb6, 0xee, 0xaf, 0x3f}, - {0xea, 0xa8, 0xd4, 0xf7, 0xbe, 0x1a, 0x39, 0x21, 0x5c, 0xf4, 0x7e, 0x9, 0x4c, 0x23, 0x27, 0x51, 0x26, 0xa3, 0x24, 0x53, 0xba, 0x32, 0x3c, 0xd2, 0x44, 0xa3, 0x17, 0x4a, 0x6d, 0xa6, 0xd5, 0xad}, - {0xb5, 0x1d, 0x3e, 0xa6, 0xaf, 0xf2, 0xc9, 0x8, 0x83, 0x59, 0x3d, 0x98, 0x91, 0x6b, 0x3c, 0x56, 0x4c, 0xf8, 0x7c, 0xa1, 0x72, 0x86, 0x60, 0x4d, 0x46, 0xe2, 0x3e, 0xcc, 0x8, 0x6e, 0xc7, 0xf6}, - {0x2f, 0x98, 0x33, 0xb3, 0xb1, 0xbc, 0x76, 0x5e, 0x2b, 0xd6, 0x66, 0xa5, 0xef, 0xc4, 0xe6, 0x2a, 0x6, 0xf4, 0xb6, 0xe8, 0xbe, 0xc1, 0xd4, 0x36, 0x74, 0xee, 0x82, 0x15, 0xbc, 0xef, 0x21, 0x63}, - {0xfd, 0xc1, 0x4e, 0xd, 0xf4, 0x53, 0xc9, 0x69, 0xa7, 0x7d, 0x5a, 0xc4, 0x6, 0x58, 0x58, 0x26, 0x7e, 0xc1, 0x14, 0x16, 0x6, 0xe0, 0xfa, 0x16, 0x7e, 0x90, 0xaf, 0x3d, 0x28, 0x63, 0x9d, 0x3f}, - {0xd2, 0xc9, 0xf2, 0xe3, 0x0, 0x9b, 0xd2, 0xc, 0x5f, 0xaa, 0xce, 0x30, 0xb7, 0xd4, 0xc, 0x30, 0x74, 0x2a, 0x51, 0x16, 0xf2, 0xe0, 0x32, 0x98, 0xd, 0xeb, 0x30, 0xd8, 0xe3, 0xce, 0xf8, 0x9a}, - {0x4b, 0xc5, 0x9e, 0x7b, 0xb5, 0xf1, 0x79, 0x92, 0xff, 0x51, 0xe6, 0x6e, 0x4, 0x86, 0x68, 0xd3, 0x9b, 0x23, 0x4d, 0x57, 0xe6, 0x96, 0x67, 0x31, 0xcc, 0xe6, 0xa6, 0xf3, 0x17, 0xa, 0x75, 0x5}, - {0xb1, 0x76, 0x81, 0xd9, 0x13, 0x32, 0x6c, 0xce, 0x3c, 0x17, 0x52, 0x84, 0xf8, 0x5, 0xa2, 0x62, 0xf4, 0x2b, 0xcb, 0xb3, 0x78, 0x47, 0x15, 0x47, 0xff, 0x46, 0x54, 0x82, 0x23, 0x93, 0x6a, 0x48}, - {0x38, 0xdf, 0x58, 0x7, 0x4e, 0x5e, 0x65, 0x65, 0xf2, 0xfc, 0x7c, 0x89, 0xfc, 0x86, 0x50, 0x8e, 0x31, 0x70, 0x2e, 0x44, 0xd0, 0xb, 0xca, 0x86, 0xf0, 0x40, 0x9, 0xa2, 0x30, 0x78, 0x47, 0x4e}, - {0x65, 0xa0, 0xee, 0x39, 0xd1, 0xf7, 0x38, 0x83, 0xf7, 0x5e, 0xe9, 0x37, 0xe4, 0x2c, 0x3a, 0xbd, 0x21, 0x97, 0xb2, 0x26, 0x1, 0x13, 0xf8, 0x6f, 0xa3, 0x44, 0xed, 0xd1, 0xef, 0x9f, 0xde, 0xe7}, - {0x8b, 0xa0, 0xdf, 0x15, 0x76, 0x25, 0x92, 0xd9, 0x3c, 0x85, 0xf7, 0xf6, 0x12, 0xdc, 0x42, 0xbe, 0xd8, 0xa7, 0xec, 0x7c, 0xab, 0x27, 0xb0, 0x7e, 0x53, 0x8d, 0x7d, 0xda, 0xaa, 0x3e, 0xa8, 0xde}, - {0xaa, 0x25, 0xce, 0x93, 0xbd, 0x2, 0x69, 0xd8, 0x5a, 0xf6, 0x43, 0xfd, 0x1a, 0x73, 0x8, 0xf9, 0xc0, 0x5f, 0xef, 0xda, 0x17, 0x4a, 0x19, 0xa5, 0x97, 0x4d, 0x66, 0x33, 0x4c, 0xfd, 0x21, 0x6a}, - {0x35, 0xb4, 0x98, 0x31, 0xdb, 0x41, 0x15, 0x70, 0xea, 0x1e, 0xf, 0xbb, 0xed, 0xcd, 0x54, 0x9b, 0x9a, 0xd0, 0x63, 0xa1, 0x51, 0x97, 0x40, 0x72, 0xf6, 0x75, 0x9d, 0xbf, 0x91, 0x47, 0x6f, 0xe2} -}; - -#define JH_SWAP1(x) (x) = ((((x) & 0x5555555555555555ULL) << 1) | (((x) & 0xaaaaaaaaaaaaaaaaULL) >> 1)); -#define JH_SWAP2(x) (x) = ((((x) & 0x3333333333333333ULL) << 2) | (((x) & 0xccccccccccccccccULL) >> 2)); -#define JH_SWAP4(x) (x) = ((((x) & 0x0f0f0f0f0f0f0f0fULL) << 4) | (((x) & 0xf0f0f0f0f0f0f0f0ULL) >> 4)); -#define JH_SWAP8(x) (x) = ((((x) & 0x00ff00ff00ff00ffULL) << 8) | (((x) & 0xff00ff00ff00ff00ULL) >> 8)); -#define JH_SWAP16(x) (x) = ((((x) & 0x0000ffff0000ffffULL) << 16) | (((x) & 0xffff0000ffff0000ULL) >> 16)); -#define JH_SWAP32(x) (x) = (((x) << 32) | ((x) >> 32)); - -#define JH_L(m0,m1,m2,m3,m4,m5,m6,m7) \ - (m4) ^= (m1); \ - (m5) ^= (m2); \ - (m6) ^= (m0) ^ (m3); \ - (m7) ^= (m0); \ - (m0) ^= (m5); \ - (m1) ^= (m6); \ - (m2) ^= (m4) ^ (m7); \ + { + {0x72, 0xd5, 0xde, 0xa2, 0xdf, 0x15, 0xf8, 0x67, 0x7b, 0x84, 0x15, 0xa, 0xb7, 0x23, 0x15, 0x57, 0x81, 0xab, 0xd6, 0x90, 0x4d, 0x5a, 0x87, 0xf6, 0x4e, 0x9f, 0x4f, 0xc5, 0xc3, 0xd1, 0x2b, 0x40}, + {0xea, 0x98, 0x3a, 0xe0, 0x5c, 0x45, 0xfa, 0x9c, 0x3, 0xc5, 0xd2, 0x99, 0x66, 0xb2, 0x99, 0x9a, 0x66, 0x2, 0x96, 0xb4, 0xf2, 0xbb, 0x53, 0x8a, 0xb5, 0x56, 0x14, 0x1a, 0x88, 0xdb, 0xa2, 0x31}, + {0x3, 0xa3, 0x5a, 0x5c, 0x9a, 0x19, 0xe, 0xdb, 0x40, 0x3f, 0xb2, 0xa, 0x87, 0xc1, 0x44, 0x10, 0x1c, 0x5, 0x19, 0x80, 0x84, 0x9e, 0x95, 0x1d, 0x6f, 0x33, 0xeb, 0xad, 0x5e, 0xe7, 0xcd, 0xdc}, + {0x10, 0xba, 0x13, 0x92, 0x2, 0xbf, 0x6b, 0x41, 0xdc, 0x78, 0x65, 0x15, 0xf7, 0xbb, 0x27, 0xd0, 0xa, 0x2c, 0x81, 0x39, 0x37, 0xaa, 0x78, 0x50, 0x3f, 0x1a, 0xbf, 0xd2, 0x41, 0x0, 0x91, 0xd3}, + {0x42, 0x2d, 0x5a, 0xd, 0xf6, 0xcc, 0x7e, 0x90, 0xdd, 0x62, 0x9f, 0x9c, 0x92, 0xc0, 0x97, 0xce, 0x18, 0x5c, 0xa7, 0xb, 0xc7, 0x2b, 0x44, 0xac, 0xd1, 0xdf, 0x65, 0xd6, 0x63, 0xc6, 0xfc, 0x23}, + {0x97, 0x6e, 0x6c, 0x3, 0x9e, 0xe0, 0xb8, 0x1a, 0x21, 0x5, 0x45, 0x7e, 0x44, 0x6c, 0xec, 0xa8, 0xee, 0xf1, 0x3, 0xbb, 0x5d, 0x8e, 0x61, 0xfa, 0xfd, 0x96, 0x97, 0xb2, 0x94, 0x83, 0x81, 0x97}, + {0x4a, 0x8e, 0x85, 0x37, 0xdb, 0x3, 0x30, 0x2f, 0x2a, 0x67, 0x8d, 0x2d, 0xfb, 0x9f, 0x6a, 0x95, 0x8a, 0xfe, 0x73, 0x81, 0xf8, 0xb8, 0x69, 0x6c, 0x8a, 0xc7, 0x72, 0x46, 0xc0, 0x7f, 0x42, 0x14}, + {0xc5, 0xf4, 0x15, 0x8f, 0xbd, 0xc7, 0x5e, 0xc4, 0x75, 0x44, 0x6f, 0xa7, 0x8f, 0x11, 0xbb, 0x80, 0x52, 0xde, 0x75, 0xb7, 0xae, 0xe4, 0x88, 0xbc, 0x82, 0xb8, 0x0, 0x1e, 0x98, 0xa6, 0xa3, 0xf4}, + {0x8e, 0xf4, 0x8f, 0x33, 0xa9, 0xa3, 0x63, 0x15, 0xaa, 0x5f, 0x56, 0x24, 0xd5, 0xb7, 0xf9, 0x89, 0xb6, 0xf1, 0xed, 0x20, 0x7c, 0x5a, 0xe0, 0xfd, 0x36, 0xca, 0xe9, 0x5a, 0x6, 0x42, 0x2c, 0x36}, + {0xce, 0x29, 0x35, 0x43, 0x4e, 0xfe, 0x98, 0x3d, 0x53, 0x3a, 0xf9, 0x74, 0x73, 0x9a, 0x4b, 0xa7, 0xd0, 0xf5, 0x1f, 0x59, 0x6f, 0x4e, 0x81, 0x86, 0xe, 0x9d, 0xad, 0x81, 0xaf, 0xd8, 0x5a, 0x9f}, + {0xa7, 0x5, 0x6, 0x67, 0xee, 0x34, 0x62, 0x6a, 0x8b, 0xb, 0x28, 0xbe, 0x6e, 0xb9, 0x17, 0x27, 0x47, 0x74, 0x7, 0x26, 0xc6, 0x80, 0x10, 0x3f, 0xe0, 0xa0, 0x7e, 0x6f, 0xc6, 0x7e, 0x48, 0x7b}, + {0xd, 0x55, 0xa, 0xa5, 0x4a, 0xf8, 0xa4, 0xc0, 0x91, 0xe3, 0xe7, 0x9f, 0x97, 0x8e, 0xf1, 0x9e, 0x86, 0x76, 0x72, 0x81, 0x50, 0x60, 0x8d, 0xd4, 0x7e, 0x9e, 0x5a, 0x41, 0xf3, 0xe5, 0xb0, 0x62}, + {0xfc, 0x9f, 0x1f, 0xec, 0x40, 0x54, 0x20, 0x7a, 0xe3, 0xe4, 0x1a, 0x0, 0xce, 0xf4, 0xc9, 0x84, 0x4f, 0xd7, 0x94, 0xf5, 0x9d, 0xfa, 0x95, 0xd8, 0x55, 0x2e, 0x7e, 0x11, 0x24, 0xc3, 0x54, 0xa5}, + {0x5b, 0xdf, 0x72, 0x28, 0xbd, 0xfe, 0x6e, 0x28, 0x78, 0xf5, 0x7f, 0xe2, 0xf, 0xa5, 0xc4, 0xb2, 0x5, 0x89, 0x7c, 0xef, 0xee, 0x49, 0xd3, 0x2e, 0x44, 0x7e, 0x93, 0x85, 0xeb, 0x28, 0x59, 0x7f}, + {0x70, 0x5f, 0x69, 0x37, 0xb3, 0x24, 0x31, 0x4a, 0x5e, 0x86, 0x28, 0xf1, 0x1d, 0xd6, 0xe4, 0x65, 0xc7, 0x1b, 0x77, 0x4, 0x51, 0xb9, 0x20, 0xe7, 0x74, 0xfe, 0x43, 0xe8, 0x23, 0xd4, 0x87, 0x8a}, + {0x7d, 0x29, 0xe8, 0xa3, 0x92, 0x76, 0x94, 0xf2, 0xdd, 0xcb, 0x7a, 0x9, 0x9b, 0x30, 0xd9, 0xc1, 0x1d, 0x1b, 0x30, 0xfb, 0x5b, 0xdc, 0x1b, 0xe0, 0xda, 0x24, 0x49, 0x4f, 0xf2, 0x9c, 0x82, 0xbf}, + {0xa4, 0xe7, 0xba, 0x31, 0xb4, 0x70, 0xbf, 0xff, 0xd, 0x32, 0x44, 0x5, 0xde, 0xf8, 0xbc, 0x48, 0x3b, 0xae, 0xfc, 0x32, 0x53, 0xbb, 0xd3, 0x39, 0x45, 0x9f, 0xc3, 0xc1, 0xe0, 0x29, 0x8b, 0xa0}, + {0xe5, 0xc9, 0x5, 0xfd, 0xf7, 0xae, 0x9, 0xf, 0x94, 0x70, 0x34, 0x12, 0x42, 0x90, 0xf1, 0x34, 0xa2, 0x71, 0xb7, 0x1, 0xe3, 0x44, 0xed, 0x95, 0xe9, 0x3b, 0x8e, 0x36, 0x4f, 0x2f, 0x98, 0x4a}, + {0x88, 0x40, 0x1d, 0x63, 0xa0, 0x6c, 0xf6, 0x15, 0x47, 0xc1, 0x44, 0x4b, 0x87, 0x52, 0xaf, 0xff, 0x7e, 0xbb, 0x4a, 0xf1, 0xe2, 0xa, 0xc6, 0x30, 0x46, 0x70, 0xb6, 0xc5, 0xcc, 0x6e, 0x8c, 0xe6}, + {0xa4, 0xd5, 0xa4, 0x56, 0xbd, 0x4f, 0xca, 0x0, 0xda, 0x9d, 0x84, 0x4b, 0xc8, 0x3e, 0x18, 0xae, 0x73, 0x57, 0xce, 0x45, 0x30, 0x64, 0xd1, 0xad, 0xe8, 0xa6, 0xce, 0x68, 0x14, 0x5c, 0x25, 0x67}, + {0xa3, 0xda, 0x8c, 0xf2, 0xcb, 0xe, 0xe1, 0x16, 0x33, 0xe9, 0x6, 0x58, 0x9a, 0x94, 0x99, 0x9a, 0x1f, 0x60, 0xb2, 0x20, 0xc2, 0x6f, 0x84, 0x7b, 0xd1, 0xce, 0xac, 0x7f, 0xa0, 0xd1, 0x85, 0x18}, + {0x32, 0x59, 0x5b, 0xa1, 0x8d, 0xdd, 0x19, 0xd3, 0x50, 0x9a, 0x1c, 0xc0, 0xaa, 0xa5, 0xb4, 0x46, 0x9f, 0x3d, 0x63, 0x67, 0xe4, 0x4, 0x6b, 0xba, 0xf6, 0xca, 0x19, 0xab, 0xb, 0x56, 0xee, 0x7e}, + {0x1f, 0xb1, 0x79, 0xea, 0xa9, 0x28, 0x21, 0x74, 0xe9, 0xbd, 0xf7, 0x35, 0x3b, 0x36, 0x51, 0xee, 0x1d, 0x57, 0xac, 0x5a, 0x75, 0x50, 0xd3, 0x76, 0x3a, 0x46, 0xc2, 0xfe, 0xa3, 0x7d, 0x70, 0x1}, + {0xf7, 0x35, 0xc1, 0xaf, 0x98, 0xa4, 0xd8, 0x42, 0x78, 0xed, 0xec, 0x20, 0x9e, 0x6b, 0x67, 0x79, 0x41, 0x83, 0x63, 0x15, 0xea, 0x3a, 0xdb, 0xa8, 0xfa, 0xc3, 0x3b, 0x4d, 0x32, 0x83, 0x2c, 0x83}, + {0xa7, 0x40, 0x3b, 0x1f, 0x1c, 0x27, 0x47, 0xf3, 0x59, 0x40, 0xf0, 0x34, 0xb7, 0x2d, 0x76, 0x9a, 0xe7, 0x3e, 0x4e, 0x6c, 0xd2, 0x21, 0x4f, 0xfd, 0xb8, 0xfd, 0x8d, 0x39, 0xdc, 0x57, 0x59, 0xef}, + {0x8d, 0x9b, 0xc, 0x49, 0x2b, 0x49, 0xeb, 0xda, 0x5b, 0xa2, 0xd7, 0x49, 0x68, 0xf3, 0x70, 0xd, 0x7d, 0x3b, 0xae, 0xd0, 0x7a, 0x8d, 0x55, 0x84, 0xf5, 0xa5, 0xe9, 0xf0, 0xe4, 0xf8, 0x8e, 0x65}, + {0xa0, 0xb8, 0xa2, 0xf4, 0x36, 0x10, 0x3b, 0x53, 0xc, 0xa8, 0x7, 0x9e, 0x75, 0x3e, 0xec, 0x5a, 0x91, 0x68, 0x94, 0x92, 0x56, 0xe8, 0x88, 0x4f, 0x5b, 0xb0, 0x5c, 0x55, 0xf8, 0xba, 0xbc, 0x4c}, + {0xe3, 0xbb, 0x3b, 0x99, 0xf3, 0x87, 0x94, 0x7b, 0x75, 0xda, 0xf4, 0xd6, 0x72, 0x6b, 0x1c, 0x5d, 0x64, 0xae, 0xac, 0x28, 0xdc, 0x34, 0xb3, 0x6d, 0x6c, 0x34, 0xa5, 0x50, 0xb8, 0x28, 0xdb, 0x71}, + {0xf8, 0x61, 0xe2, 0xf2, 0x10, 0x8d, 0x51, 0x2a, 0xe3, 0xdb, 0x64, 0x33, 0x59, 0xdd, 0x75, 0xfc, 0x1c, 0xac, 0xbc, 0xf1, 0x43, 0xce, 0x3f, 0xa2, 0x67, 0xbb, 0xd1, 0x3c, 0x2, 0xe8, 0x43, 0xb0}, + {0x33, 0xa, 0x5b, 0xca, 0x88, 0x29, 0xa1, 0x75, 0x7f, 0x34, 0x19, 0x4d, 0xb4, 0x16, 0x53, 0x5c, 0x92, 0x3b, 0x94, 0xc3, 0xe, 0x79, 0x4d, 0x1e, 0x79, 0x74, 0x75, 0xd7, 0xb6, 0xee, 0xaf, 0x3f}, + {0xea, 0xa8, 0xd4, 0xf7, 0xbe, 0x1a, 0x39, 0x21, 0x5c, 0xf4, 0x7e, 0x9, 0x4c, 0x23, 0x27, 0x51, 0x26, 0xa3, 0x24, 0x53, 0xba, 0x32, 0x3c, 0xd2, 0x44, 0xa3, 0x17, 0x4a, 0x6d, 0xa6, 0xd5, 0xad}, + {0xb5, 0x1d, 0x3e, 0xa6, 0xaf, 0xf2, 0xc9, 0x8, 0x83, 0x59, 0x3d, 0x98, 0x91, 0x6b, 0x3c, 0x56, 0x4c, 0xf8, 0x7c, 0xa1, 0x72, 0x86, 0x60, 0x4d, 0x46, 0xe2, 0x3e, 0xcc, 0x8, 0x6e, 0xc7, 0xf6}, + {0x2f, 0x98, 0x33, 0xb3, 0xb1, 0xbc, 0x76, 0x5e, 0x2b, 0xd6, 0x66, 0xa5, 0xef, 0xc4, 0xe6, 0x2a, 0x6, 0xf4, 0xb6, 0xe8, 0xbe, 0xc1, 0xd4, 0x36, 0x74, 0xee, 0x82, 0x15, 0xbc, 0xef, 0x21, 0x63}, + {0xfd, 0xc1, 0x4e, 0xd, 0xf4, 0x53, 0xc9, 0x69, 0xa7, 0x7d, 0x5a, 0xc4, 0x6, 0x58, 0x58, 0x26, 0x7e, 0xc1, 0x14, 0x16, 0x6, 0xe0, 0xfa, 0x16, 0x7e, 0x90, 0xaf, 0x3d, 0x28, 0x63, 0x9d, 0x3f}, + {0xd2, 0xc9, 0xf2, 0xe3, 0x0, 0x9b, 0xd2, 0xc, 0x5f, 0xaa, 0xce, 0x30, 0xb7, 0xd4, 0xc, 0x30, 0x74, 0x2a, 0x51, 0x16, 0xf2, 0xe0, 0x32, 0x98, 0xd, 0xeb, 0x30, 0xd8, 0xe3, 0xce, 0xf8, 0x9a}, + {0x4b, 0xc5, 0x9e, 0x7b, 0xb5, 0xf1, 0x79, 0x92, 0xff, 0x51, 0xe6, 0x6e, 0x4, 0x86, 0x68, 0xd3, 0x9b, 0x23, 0x4d, 0x57, 0xe6, 0x96, 0x67, 0x31, 0xcc, 0xe6, 0xa6, 0xf3, 0x17, 0xa, 0x75, 0x5}, + {0xb1, 0x76, 0x81, 0xd9, 0x13, 0x32, 0x6c, 0xce, 0x3c, 0x17, 0x52, 0x84, 0xf8, 0x5, 0xa2, 0x62, 0xf4, 0x2b, 0xcb, 0xb3, 0x78, 0x47, 0x15, 0x47, 0xff, 0x46, 0x54, 0x82, 0x23, 0x93, 0x6a, 0x48}, + {0x38, 0xdf, 0x58, 0x7, 0x4e, 0x5e, 0x65, 0x65, 0xf2, 0xfc, 0x7c, 0x89, 0xfc, 0x86, 0x50, 0x8e, 0x31, 0x70, 0x2e, 0x44, 0xd0, 0xb, 0xca, 0x86, 0xf0, 0x40, 0x9, 0xa2, 0x30, 0x78, 0x47, 0x4e}, + {0x65, 0xa0, 0xee, 0x39, 0xd1, 0xf7, 0x38, 0x83, 0xf7, 0x5e, 0xe9, 0x37, 0xe4, 0x2c, 0x3a, 0xbd, 0x21, 0x97, 0xb2, 0x26, 0x1, 0x13, 0xf8, 0x6f, 0xa3, 0x44, 0xed, 0xd1, 0xef, 0x9f, 0xde, 0xe7}, + {0x8b, 0xa0, 0xdf, 0x15, 0x76, 0x25, 0x92, 0xd9, 0x3c, 0x85, 0xf7, 0xf6, 0x12, 0xdc, 0x42, 0xbe, 0xd8, 0xa7, 0xec, 0x7c, 0xab, 0x27, 0xb0, 0x7e, 0x53, 0x8d, 0x7d, 0xda, 0xaa, 0x3e, 0xa8, 0xde}, + {0xaa, 0x25, 0xce, 0x93, 0xbd, 0x2, 0x69, 0xd8, 0x5a, 0xf6, 0x43, 0xfd, 0x1a, 0x73, 0x8, 0xf9, 0xc0, 0x5f, 0xef, 0xda, 0x17, 0x4a, 0x19, 0xa5, 0x97, 0x4d, 0x66, 0x33, 0x4c, 0xfd, 0x21, 0x6a}, + {0x35, 0xb4, 0x98, 0x31, 0xdb, 0x41, 0x15, 0x70, 0xea, 0x1e, 0xf, 0xbb, 0xed, 0xcd, 0x54, 0x9b, 0x9a, 0xd0, 0x63, 0xa1, 0x51, 0x97, 0x40, 0x72, 0xf6, 0x75, 0x9d, 0xbf, 0x91, 0x47, 0x6f, 0xe2}}; + +#define JH_SWAP1(x) (x) = ((((x)&0x5555555555555555ULL) << 1) | (((x)&0xaaaaaaaaaaaaaaaaULL) >> 1)); +#define JH_SWAP2(x) (x) = ((((x)&0x3333333333333333ULL) << 2) | (((x)&0xccccccccccccccccULL) >> 2)); +#define JH_SWAP4(x) (x) = ((((x)&0x0f0f0f0f0f0f0f0fULL) << 4) | (((x)&0xf0f0f0f0f0f0f0f0ULL) >> 4)); +#define JH_SWAP8(x) (x) = ((((x)&0x00ff00ff00ff00ffULL) << 8) | (((x)&0xff00ff00ff00ff00ULL) >> 8)); +#define JH_SWAP16(x) (x) = ((((x)&0x0000ffff0000ffffULL) << 16) | (((x)&0xffff0000ffff0000ULL) >> 16)); +#define JH_SWAP32(x) (x) = (((x) << 32) | ((x) >> 32)); + +#define JH_L(m0, m1, m2, m3, m4, m5, m6, m7) \ + (m4) ^= (m1); \ + (m5) ^= (m2); \ + (m6) ^= (m0) ^ (m3); \ + (m7) ^= (m0); \ + (m0) ^= (m5); \ + (m1) ^= (m6); \ + (m2) ^= (m4) ^ (m7); \ (m3) ^= (m4); -#define JH_SS(m0,m1,m2,m3,m4,m5,m6,m7,cc0,cc1) \ - m3 = ~(m3); \ - m7 = ~(m7); \ - m0 ^= ((~(m2)) & (cc0)); \ - m4 ^= ((~(m6)) & (cc1)); \ - temp0 = (cc0) ^ ((m0) & (m1));\ - temp1 = (cc1) ^ ((m4) & (m5));\ - m0 ^= ((m2) & (m3)); \ - m4 ^= ((m6) & (m7)); \ - m3 ^= ((~(m1)) & (m2)); \ - m7 ^= ((~(m5)) & (m6)); \ - m1 ^= ((m0) & (m2)); \ - m5 ^= ((m4) & (m6)); \ - m2 ^= ((m0) & (~(m3))); \ - m6 ^= ((m4) & (~(m7))); \ - m0 ^= ((m1) | (m3)); \ - m4 ^= ((m5) | (m7)); \ - m3 ^= ((m1) & (m2)); \ - m7 ^= ((m5) & (m6)); \ - m1 ^= (temp0 & (m0)); \ - m5 ^= (temp1 & (m4)); \ - m2 ^= temp0; \ +#define JH_SS(m0, m1, m2, m3, m4, m5, m6, m7, cc0, cc1) \ + m3 = ~(m3); \ + m7 = ~(m7); \ + m0 ^= ((~(m2)) & (cc0)); \ + m4 ^= ((~(m6)) & (cc1)); \ + temp0 = (cc0) ^ ((m0) & (m1)); \ + temp1 = (cc1) ^ ((m4) & (m5)); \ + m0 ^= ((m2) & (m3)); \ + m4 ^= ((m6) & (m7)); \ + m3 ^= ((~(m1)) & (m2)); \ + m7 ^= ((~(m5)) & (m6)); \ + m1 ^= ((m0) & (m2)); \ + m5 ^= ((m4) & (m6)); \ + m2 ^= ((m0) & (~(m3))); \ + m6 ^= ((m4) & (~(m7))); \ + m0 ^= ((m1) | (m3)); \ + m4 ^= ((m5) | (m7)); \ + m3 ^= ((m1) & (m2)); \ + m7 ^= ((m5) & (m6)); \ + m1 ^= (temp0 & (m0)); \ + m5 ^= (temp1 & (m4)); \ + m2 ^= temp0; \ m6 ^= temp1; -__device__ void cn_jh_E8(jhHashState *state) +__device__ void cn_jh_E8(jhHashState* state) { - uint64_t i,roundnumber,temp0,temp1; + uint64_t i, roundnumber, temp0, temp1; - for (roundnumber = 0; roundnumber < 42; roundnumber = roundnumber+7) + for(roundnumber = 0; roundnumber < 42; roundnumber = roundnumber + 7) { - for (i = 0; i < 2; i++) + for(i = 0; i < 2; i++) { - JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+0])[i],((uint64_t *)d_E8_rc[roundnumber+0])[i+2] ); - JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); - JH_SWAP1(state->x[1][i]); JH_SWAP1(state->x[3][i]); JH_SWAP1(state->x[5][i]); JH_SWAP1(state->x[7][i]); + JH_SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64_t*)d_E8_rc[roundnumber + 0])[i], ((uint64_t*)d_E8_rc[roundnumber + 0])[i + 2]); + JH_L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]); + JH_SWAP1(state->x[1][i]); + JH_SWAP1(state->x[3][i]); + JH_SWAP1(state->x[5][i]); + JH_SWAP1(state->x[7][i]); } - for (i = 0; i < 2; i++) + for(i = 0; i < 2; i++) { - JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+1])[i],((uint64_t *)d_E8_rc[roundnumber+1])[i+2] ); - JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); - JH_SWAP2(state->x[1][i]); JH_SWAP2(state->x[3][i]); JH_SWAP2(state->x[5][i]); JH_SWAP2(state->x[7][i]); + JH_SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64_t*)d_E8_rc[roundnumber + 1])[i], ((uint64_t*)d_E8_rc[roundnumber + 1])[i + 2]); + JH_L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]); + JH_SWAP2(state->x[1][i]); + JH_SWAP2(state->x[3][i]); + JH_SWAP2(state->x[5][i]); + JH_SWAP2(state->x[7][i]); } - for (i = 0; i < 2; i++) + for(i = 0; i < 2; i++) { - JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+2])[i],((uint64_t *)d_E8_rc[roundnumber+2])[i+2] ); - JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); - JH_SWAP4(state->x[1][i]); JH_SWAP4(state->x[3][i]); JH_SWAP4(state->x[5][i]); JH_SWAP4(state->x[7][i]); + JH_SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64_t*)d_E8_rc[roundnumber + 2])[i], ((uint64_t*)d_E8_rc[roundnumber + 2])[i + 2]); + JH_L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]); + JH_SWAP4(state->x[1][i]); + JH_SWAP4(state->x[3][i]); + JH_SWAP4(state->x[5][i]); + JH_SWAP4(state->x[7][i]); } - for (i = 0; i < 2; i++) + for(i = 0; i < 2; i++) { - JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+3])[i],((uint64_t *)d_E8_rc[roundnumber+3])[i+2] ); - JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); - JH_SWAP8(state->x[1][i]); JH_SWAP8(state->x[3][i]); JH_SWAP8(state->x[5][i]); JH_SWAP8(state->x[7][i]); + JH_SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64_t*)d_E8_rc[roundnumber + 3])[i], ((uint64_t*)d_E8_rc[roundnumber + 3])[i + 2]); + JH_L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]); + JH_SWAP8(state->x[1][i]); + JH_SWAP8(state->x[3][i]); + JH_SWAP8(state->x[5][i]); + JH_SWAP8(state->x[7][i]); } - for (i = 0; i < 2; i++) + for(i = 0; i < 2; i++) { - JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+4])[i],((uint64_t *)d_E8_rc[roundnumber+4])[i+2] ); - JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); - JH_SWAP16(state->x[1][i]); JH_SWAP16(state->x[3][i]); JH_SWAP16(state->x[5][i]); JH_SWAP16(state->x[7][i]); + JH_SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64_t*)d_E8_rc[roundnumber + 4])[i], ((uint64_t*)d_E8_rc[roundnumber + 4])[i + 2]); + JH_L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]); + JH_SWAP16(state->x[1][i]); + JH_SWAP16(state->x[3][i]); + JH_SWAP16(state->x[5][i]); + JH_SWAP16(state->x[7][i]); } - for (i = 0; i < 2; i++) + for(i = 0; i < 2; i++) { - JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+5])[i],((uint64_t *)d_E8_rc[roundnumber+5])[i+2] ); - JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); - JH_SWAP32(state->x[1][i]); JH_SWAP32(state->x[3][i]); JH_SWAP32(state->x[5][i]); JH_SWAP32(state->x[7][i]); + JH_SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64_t*)d_E8_rc[roundnumber + 5])[i], ((uint64_t*)d_E8_rc[roundnumber + 5])[i + 2]); + JH_L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]); + JH_SWAP32(state->x[1][i]); + JH_SWAP32(state->x[3][i]); + JH_SWAP32(state->x[5][i]); + JH_SWAP32(state->x[7][i]); } - for (i = 0; i < 2; i++) + for(i = 0; i < 2; i++) { - JH_SS(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i],((uint64_t *)d_E8_rc[roundnumber+6])[i],((uint64_t *)d_E8_rc[roundnumber+6])[i+2] ); - JH_L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); + JH_SS(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i], ((uint64_t*)d_E8_rc[roundnumber + 6])[i], ((uint64_t*)d_E8_rc[roundnumber + 6])[i + 2]); + JH_L(state->x[0][i], state->x[2][i], state->x[4][i], state->x[6][i], state->x[1][i], state->x[3][i], state->x[5][i], state->x[7][i]); } - for (i = 1; i < 8; i = i+2) + for(i = 1; i < 8; i = i + 2) { temp0 = state->x[i][0]; state->x[i][0] = state->x[i][1]; @@ -170,75 +187,75 @@ __device__ void cn_jh_E8(jhHashState *state) } } -__device__ void cn_jh_F8(jhHashState *state) +__device__ void cn_jh_F8(jhHashState* state) { uint64_t i; - for (i = 0; i < 8; i++) - state->x[i >> 1][i & 1] ^= ((uint64_t *)state->buffer)[i]; + for(i = 0; i < 8; i++) + state->x[i >> 1][i & 1] ^= ((uint64_t*)state->buffer)[i]; cn_jh_E8(state); - for (i = 0; i < 8; i++) - state->x[(8+i) >> 1][(8+i) & 1] ^= ((uint64_t *)state->buffer)[i]; + for(i = 0; i < 8; i++) + state->x[(8 + i) >> 1][(8 + i) & 1] ^= ((uint64_t*)state->buffer)[i]; } -__device__ void cn_jh_update(jhHashState * __restrict__ state, const BitSequence * __restrict__ data, DataLength databitlen) +__device__ void cn_jh_update(jhHashState* __restrict__ state, const BitSequence* __restrict__ data, DataLength databitlen) { DataLength index; state->databitlen += databitlen; index = 0; - if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) < 512) ) + if((state->datasize_in_buffer > 0) && ((state->datasize_in_buffer + databitlen) < 512)) { - if ( (databitlen & 7) == 0 ) - memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3)); + if((databitlen & 7) == 0) + memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64 - (state->datasize_in_buffer >> 3)); else - memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3)+1); + memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64 - (state->datasize_in_buffer >> 3) + 1); state->datasize_in_buffer += databitlen; databitlen = 0; } - if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) ) + if((state->datasize_in_buffer > 0) && ((state->datasize_in_buffer + databitlen) >= 512)) { - memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) ); - index = 64-(state->datasize_in_buffer >> 3); + memcpy(state->buffer + (state->datasize_in_buffer >> 3), data, 64 - (state->datasize_in_buffer >> 3)); + index = 64 - (state->datasize_in_buffer >> 3); databitlen = databitlen - (512 - state->datasize_in_buffer); cn_jh_F8(state); state->datasize_in_buffer = 0; } - for ( ; databitlen >= 512; index = index+64, databitlen = databitlen - 512) + for(; databitlen >= 512; index = index + 64, databitlen = databitlen - 512) { - memcpy(state->buffer, data+index, 64); + memcpy(state->buffer, data + index, 64); cn_jh_F8(state); } - if ( databitlen > 0) + if(databitlen > 0) { - if ((databitlen & 7) == 0) - memcpy(state->buffer, data+index, (databitlen & 0x1ff) >> 3); + if((databitlen & 7) == 0) + memcpy(state->buffer, data + index, (databitlen & 0x1ff) >> 3); else - memcpy(state->buffer, data+index, ((databitlen & 0x1ff) >> 3)+1); + memcpy(state->buffer, data + index, ((databitlen & 0x1ff) >> 3) + 1); state->datasize_in_buffer = databitlen; } } /*pad the message, process the padded block(s), truncate the hash value H to obtain the message digest*/ -__device__ void cn_jh_final(jhHashState * __restrict__ state, BitSequence * __restrict__ hashval) +__device__ void cn_jh_final(jhHashState* __restrict__ state, BitSequence* __restrict__ hashval) { unsigned int i; //uint32_t *bufptr = (uint32_t *)state->buffer; - if ( (state->databitlen & 0x1ff) == 0 ) + if((state->databitlen & 0x1ff) == 0) { /*pad the message when databitlen is multiple of 512 bits, then process the padded block*/ memset(state->buffer, 0, 64); //for( i = 0; i < 16; i++ ) *(bufptr+i) = 0x00000000; - state->buffer[0] = 0x80; + state->buffer[0] = 0x80; state->buffer[63] = state->databitlen & 0xff; - state->buffer[62] = (state->databitlen >> 8) & 0xff; + state->buffer[62] = (state->databitlen >> 8) & 0xff; state->buffer[61] = (state->databitlen >> 16) & 0xff; state->buffer[60] = (state->databitlen >> 24) & 0xff; state->buffer[59] = (state->databitlen >> 32) & 0xff; @@ -250,19 +267,19 @@ __device__ void cn_jh_final(jhHashState * __restrict__ state, BitSequence * __re else { /*set the rest of the bytes in the buffer to 0*/ - if ( (state->datasize_in_buffer & 7) == 0) + if((state->datasize_in_buffer & 7) == 0) { - for (i = (state->databitlen & 0x1ff) >> 3; i < 64; i++) + for(i = (state->databitlen & 0x1ff) >> 3; i < 64; i++) state->buffer[i] = 0; } else { - for (i = ((state->databitlen & 0x1ff) >> 3)+1; i < 64; i++) + for(i = ((state->databitlen & 0x1ff) >> 3) + 1; i < 64; i++) state->buffer[i] = 0; } /*pad and process the partial block when databitlen is not multiple of 512 bits, then hash the padded blocks*/ - state->buffer[((state->databitlen & 0x1ff) >> 3)] |= 1 << (7- (state->databitlen & 7)); + state->buffer[((state->databitlen & 0x1ff) >> 3)] |= 1 << (7 - (state->databitlen & 7)); cn_jh_F8(state); memset(state->buffer, 0, 64); @@ -278,10 +295,10 @@ __device__ void cn_jh_final(jhHashState * __restrict__ state, BitSequence * __re cn_jh_F8(state); } - memcpy(hashval,(unsigned char*)state->x+64+32,32); + memcpy(hashval, (unsigned char*)state->x + 64 + 32, 32); } -__device__ void cn_jh_init(jhHashState *state, int hashbitlen) +__device__ void cn_jh_init(jhHashState* state, int hashbitlen) { state->databitlen = 0; state->datasize_in_buffer = 0; @@ -289,7 +306,7 @@ __device__ void cn_jh_init(jhHashState *state, int hashbitlen) memcpy(state->x, d_JH256_H0, 128); } -__device__ void cn_jh(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval) +__device__ void cn_jh(const BitSequence* __restrict__ data, DataLength len, BitSequence* __restrict__ hashval) { int hashbitlen = 256; DataLength databitlen = len << 3; diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp index 3f535631d..0fe277bd5 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp @@ -7,42 +7,49 @@ __constant__ #else const #endif -uint64_t keccakf_rndc[24] ={ - 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, - 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, - 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, - 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, - 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, - 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, - 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, - 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 -}; + uint64_t keccakf_rndc[24] = { + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008}; #if __CUDA_ARCH__ >= 350 - __forceinline__ __device__ uint64_t cuda_rotl64(const uint64_t value, const int offset) +__forceinline__ __device__ uint64_t cuda_rotl64(const uint64_t value, const int offset) +{ + uint2 result; + if(offset >= 32) + { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" + : "=r"(result.x) + : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" + : "=r"(result.y) + : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + } + else { - uint2 result; - if(offset >= 32) - { - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); - } - else - { - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); - asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); - } - return __double_as_longlong(__hiloint2double(result.y, result.x)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" + : "=r"(result.x) + : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" + : "=r"(result.y) + : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); } - #define rotl64_1(x, y) (cuda_rotl64((x), (y))) + return __double_as_longlong(__hiloint2double(result.y, result.x)); +} +#define rotl64_1(x, y) (cuda_rotl64((x), (y))) #else - #define rotl64_1(x, y) ((x) << (y) | ((x) >> (64 - (y)))) +#define rotl64_1(x, y) ((x) << (y) | ((x) >> (64 - (y)))) #endif #define rotl64_2(x, y) rotl64_1(((x) >> 32) | ((x) << 32), (y)) #define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a)))) -__device__ __forceinline__ void cn_keccakf2(uint64_t *s) +__device__ __forceinline__ void cn_keccakf2(uint64_t* s) { uint8_t i; @@ -90,16 +97,46 @@ __device__ __forceinline__ void cn_keccakf2(uint64_t *s) s[7] = rotl64_1(s[10] ^ bc[4], 3); s[10] = rotl64_1(tmp1, 1); - tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); - tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); - tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); - tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); - tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + tmp1 = s[0]; + tmp2 = s[1]; + s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); + s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); + s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); + s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); + s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; + tmp2 = s[6]; + s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); + s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); + s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); + s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); + s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; + tmp2 = s[11]; + s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); + s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); + s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); + s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); + s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; + tmp2 = s[16]; + s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); + s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); + s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); + s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); + s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; + tmp2 = s[21]; + s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); + s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); + s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); + s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); + s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); s[0] ^= keccakf_rndc[i]; } } -__device__ __forceinline__ void cn_keccakf(uint64_t *s) +__device__ __forceinline__ void cn_keccakf(uint64_t* s) { uint64_t bc[5], tmpxor[5], tmp1, tmp2; @@ -145,16 +182,46 @@ __device__ __forceinline__ void cn_keccakf(uint64_t *s) s[7] = rotl64_1(s[10] ^ bc[4], 3); s[10] = rotl64_1(tmp1, 1); - tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); - tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); - tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); - tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); - tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + tmp1 = s[0]; + tmp2 = s[1]; + s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); + s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); + s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); + s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); + s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; + tmp2 = s[6]; + s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); + s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); + s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); + s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); + s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; + tmp2 = s[11]; + s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); + s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); + s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); + s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); + s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; + tmp2 = s[16]; + s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); + s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); + s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); + s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); + s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; + tmp2 = s[21]; + s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); + s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); + s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); + s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); + s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); s[0] ^= keccakf_rndc[i]; } } -__device__ __forceinline__ void cn_keccak(const uint8_t * __restrict__ in, uint32_t len, uint8_t * __restrict__ md) +__device__ __forceinline__ void cn_keccak(const uint8_t* __restrict__ in, uint32_t len, uint8_t* __restrict__ md) { uint64_t st[25]; diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp index fc45db1ae..b8073f03b 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_skein.hpp @@ -1,124 +1,146 @@ #pragma once -typedef unsigned int uint_t; /* native unsigned integer */ +typedef unsigned int uint_t; /* native unsigned integer */ -#define SKEIN_MODIFIER_WORDS ( 2) /* number of modifier (tweak) words */ +#define SKEIN_MODIFIER_WORDS (2) /* number of modifier (tweak) words */ -#define SKEIN_256_STATE_WORDS ( 4) -#define SKEIN_512_STATE_WORDS ( 8) +#define SKEIN_256_STATE_WORDS (4) +#define SKEIN_512_STATE_WORDS (8) #define SKEIN1024_STATE_WORDS (16) -#define SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS) -#define SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS) -#define SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS) +#define SKEIN_256_STATE_BYTES (8 * SKEIN_256_STATE_WORDS) +#define SKEIN_512_STATE_BYTES (8 * SKEIN_512_STATE_WORDS) +#define SKEIN1024_STATE_BYTES (8 * SKEIN1024_STATE_WORDS) -#define SKEIN_256_STATE_BITS (64*SKEIN_256_STATE_WORDS) -#define SKEIN_512_STATE_BITS (64*SKEIN_512_STATE_WORDS) -#define SKEIN1024_STATE_BITS (64*SKEIN1024_STATE_WORDS) +#define SKEIN_256_STATE_BITS (64 * SKEIN_256_STATE_WORDS) +#define SKEIN_512_STATE_BITS (64 * SKEIN_512_STATE_WORDS) +#define SKEIN1024_STATE_BITS (64 * SKEIN1024_STATE_WORDS) -#define SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS) -#define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS) -#define SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS) +#define SKEIN_256_BLOCK_BYTES (8 * SKEIN_256_STATE_WORDS) +#define SKEIN_512_BLOCK_BYTES (8 * SKEIN_512_STATE_WORDS) +#define SKEIN1024_BLOCK_BYTES (8 * SKEIN1024_STATE_WORDS) -#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32)) -#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22) +#define SKEIN_MK_64(hi32, lo32) ((lo32) + (((uint64_t)(hi32)) << 32)) +#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA, 0xA9FC1A22) -#define SKEIN_T1_BIT(BIT) ((BIT) - 64) /* offset 64 because it's the second word */ +#define SKEIN_T1_BIT(BIT) ((BIT)-64) /* offset 64 because it's the second word */ -#define SKEIN_T1_POS_FIRST SKEIN_T1_BIT(126) /* bits 126 : first block flag */ -#define SKEIN_T1_POS_BIT_PAD SKEIN_T1_BIT(119) /* bit 119 : partial final input byte */ -#define SKEIN_T1_POS_FINAL SKEIN_T1_BIT(127) /* bit 127 : final block flag */ -#define SKEIN_T1_POS_BLK_TYPE SKEIN_T1_BIT(120) /* bits 120..125: type field */ +#define SKEIN_T1_POS_FIRST SKEIN_T1_BIT(126) /* bits 126 : first block flag */ +#define SKEIN_T1_POS_BIT_PAD SKEIN_T1_BIT(119) /* bit 119 : partial final input byte */ +#define SKEIN_T1_POS_FINAL SKEIN_T1_BIT(127) /* bit 127 : final block flag */ +#define SKEIN_T1_POS_BLK_TYPE SKEIN_T1_BIT(120) /* bits 120..125: type field */ -#define SKEIN_T1_FLAG_FIRST (((uint64_t) 1 ) << SKEIN_T1_POS_FIRST) -#define SKEIN_T1_FLAG_BIT_PAD (((uint64_t) 1 ) << SKEIN_T1_POS_BIT_PAD) -#define SKEIN_T1_FLAG_FINAL (((uint64_t) 1 ) << SKEIN_T1_POS_FINAL) +#define SKEIN_T1_FLAG_FIRST (((uint64_t)1) << SKEIN_T1_POS_FIRST) +#define SKEIN_T1_FLAG_BIT_PAD (((uint64_t)1) << SKEIN_T1_POS_BIT_PAD) +#define SKEIN_T1_FLAG_FINAL (((uint64_t)1) << SKEIN_T1_POS_FINAL) -#define SKEIN_BLK_TYPE_MSG (48) /* message processing */ -#define SKEIN_BLK_TYPE_OUT (63) /* output stage */ +#define SKEIN_BLK_TYPE_MSG (48) /* message processing */ +#define SKEIN_BLK_TYPE_OUT (63) /* output stage */ -#define SKEIN_T1_BLK_TYPE(T) (((uint64_t) (SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE) +#define SKEIN_T1_BLK_TYPE(T) (((uint64_t)(SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE) -#define SKEIN_T1_BLK_TYPE_MSG SKEIN_T1_BLK_TYPE(MSG) /* message processing */ -#define SKEIN_T1_BLK_TYPE_OUT SKEIN_T1_BLK_TYPE(OUT) /* output stage */ +#define SKEIN_T1_BLK_TYPE_MSG SKEIN_T1_BLK_TYPE(MSG) /* message processing */ +#define SKEIN_T1_BLK_TYPE_OUT SKEIN_T1_BLK_TYPE(OUT) /* output stage */ -#define SKEIN_T1_BLK_TYPE_OUT_FINAL (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL) +#define SKEIN_T1_BLK_TYPE_OUT_FINAL (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL) -#define Skein_Set_Tweak(ctxPtr,TWK_NUM,tVal) {(ctxPtr)->h.T[TWK_NUM] = (tVal);} - -#define Skein_Set_T0(ctxPtr,T0) Skein_Set_Tweak(ctxPtr,0,T0) -#define Skein_Set_T1(ctxPtr,T1) Skein_Set_Tweak(ctxPtr,1,T1) - -#define Skein_Set_T0_T1(ctxPtr,T0,T1) { \ - Skein_Set_T0(ctxPtr,(T0)); \ - Skein_Set_T1(ctxPtr,(T1)); } - -#define Skein_Start_New_Type(ctxPtr,BLK_TYPE) \ -{ Skein_Set_T0_T1(ctxPtr,0,SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); (ctxPtr)->h.bCnt=0; } - -#define Skein_Set_Bit_Pad_Flag(hdr) { (hdr).T[1] |= SKEIN_T1_FLAG_BIT_PAD; } +#define Skein_Set_Tweak(ctxPtr, TWK_NUM, tVal) \ + { \ + (ctxPtr)->h.T[TWK_NUM] = (tVal); \ + } -#define KW_TWK_BASE (0) -#define KW_KEY_BASE (3) -#define ks (kw + KW_KEY_BASE) -#define ts (kw + KW_TWK_BASE) +#define Skein_Set_T0(ctxPtr, T0) Skein_Set_Tweak(ctxPtr, 0, T0) +#define Skein_Set_T1(ctxPtr, T1) Skein_Set_Tweak(ctxPtr, 1, T1) -#define R512(p0,p1,p2,p3,p4,p5,p6,p7,R512ROT,rNum) \ - X##p0 += X##p1; X##p1 = ROTL64(X##p1,R512ROT##_0); X##p1 ^= X##p0; \ - X##p2 += X##p3; X##p3 = ROTL64(X##p3,R512ROT##_1); X##p3 ^= X##p2; \ - X##p4 += X##p5; X##p5 = ROTL64(X##p5,R512ROT##_2); X##p5 ^= X##p4; \ - X##p6 += X##p7; X##p7 = ROTL64(X##p7,R512ROT##_3); X##p7 ^= X##p6; +#define Skein_Set_T0_T1(ctxPtr, T0, T1) \ + { \ + Skein_Set_T0(ctxPtr, (T0)); \ + Skein_Set_T1(ctxPtr, (T1)); \ + } -#define I512(R) \ - X0 += ks[((R)+1) % 9]; \ - X1 += ks[((R)+2) % 9]; \ - X2 += ks[((R)+3) % 9]; \ - X3 += ks[((R)+4) % 9]; \ - X4 += ks[((R)+5) % 9]; \ - X5 += ks[((R)+6) % 9] + ts[((R)+1) % 3]; \ - X6 += ks[((R)+7) % 9] + ts[((R)+2) % 3]; \ - X7 += ks[((R)+8) % 9] + (R)+1; +#define Skein_Start_New_Type(ctxPtr, BLK_TYPE) \ + { \ + Skein_Set_T0_T1(ctxPtr, 0, SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); \ + (ctxPtr)->h.bCnt = 0; \ + } +#define Skein_Set_Bit_Pad_Flag(hdr) \ + { \ + (hdr).T[1] |= SKEIN_T1_FLAG_BIT_PAD; \ + } -#define R512_8_rounds(R) \ - R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1); \ - R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2); \ - R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3); \ - R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4); \ - I512(2*(R)); \ - R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5); \ - R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6); \ - R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7); \ - R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8); \ - I512(2*(R)+1); +#define KW_TWK_BASE (0) +#define KW_KEY_BASE (3) +#define ks (kw + KW_KEY_BASE) +#define ts (kw + KW_TWK_BASE) + +#define R512(p0, p1, p2, p3, p4, p5, p6, p7, R512ROT, rNum) \ + X##p0 += X##p1; \ + X##p1 = ROTL64(X##p1, R512ROT##_0); \ + X##p1 ^= X##p0; \ + X##p2 += X##p3; \ + X##p3 = ROTL64(X##p3, R512ROT##_1); \ + X##p3 ^= X##p2; \ + X##p4 += X##p5; \ + X##p5 = ROTL64(X##p5, R512ROT##_2); \ + X##p5 ^= X##p4; \ + X##p6 += X##p7; \ + X##p7 = ROTL64(X##p7, R512ROT##_3); \ + X##p7 ^= X##p6; + +#define I512(R) \ + X0 += ks[((R) + 1) % 9]; \ + X1 += ks[((R) + 2) % 9]; \ + X2 += ks[((R) + 3) % 9]; \ + X3 += ks[((R) + 4) % 9]; \ + X4 += ks[((R) + 5) % 9]; \ + X5 += ks[((R) + 6) % 9] + ts[((R) + 1) % 3]; \ + X6 += ks[((R) + 7) % 9] + ts[((R) + 2) % 3]; \ + X7 += ks[((R) + 8) % 9] + (R) + 1; + +#define R512_8_rounds(R) \ + R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_0, 8 * (R) + 1); \ + R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_1, 8 * (R) + 2); \ + R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_2, 8 * (R) + 3); \ + R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_3, 8 * (R) + 4); \ + I512(2 * (R)); \ + R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_4, 8 * (R) + 5); \ + R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_5, 8 * (R) + 6); \ + R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_6, 8 * (R) + 7); \ + R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_7, 8 * (R) + 8); \ + I512(2 * (R) + 1); typedef struct { - size_t hashBitLen; - size_t bCnt; - uint64_t T[SKEIN_MODIFIER_WORDS]; + size_t hashBitLen; + size_t bCnt; + uint64_t T[SKEIN_MODIFIER_WORDS]; } Skein_Ctxt_Hdr_t; -typedef struct { +typedef struct +{ Skein_Ctxt_Hdr_t h; - uint64_t X[SKEIN_256_STATE_WORDS]; - uint8_t b[SKEIN_256_BLOCK_BYTES]; + uint64_t X[SKEIN_256_STATE_WORDS]; + uint8_t b[SKEIN_256_BLOCK_BYTES]; } Skein_256_Ctxt_t; -typedef struct { +typedef struct +{ Skein_Ctxt_Hdr_t h; - uint64_t X[SKEIN_512_STATE_WORDS]; - uint8_t b[SKEIN_512_BLOCK_BYTES]; + uint64_t X[SKEIN_512_STATE_WORDS]; + uint8_t b[SKEIN_512_BLOCK_BYTES]; } Skein_512_Ctxt_t; -typedef struct { +typedef struct +{ Skein_Ctxt_Hdr_t h; - uint64_t X[SKEIN1024_STATE_WORDS]; - uint8_t b[SKEIN1024_BLOCK_BYTES]; + uint64_t X[SKEIN1024_STATE_WORDS]; + uint8_t b[SKEIN1024_BLOCK_BYTES]; } Skein1024_Ctxt_t; -typedef struct { - uint_t statebits; +typedef struct +{ + uint_t statebits; union { Skein_Ctxt_Hdr_t h; Skein_256_Ctxt_t ctx_256; @@ -127,21 +149,20 @@ typedef struct { } u; } skeinHashState; -__device__ void cn_skein_init(skeinHashState *state, size_t hashBitLen) +__device__ void cn_skein_init(skeinHashState* state, size_t hashBitLen) { const uint64_t SKEIN_512_IV_256[] = - { - SKEIN_MK_64(0xCCD044A1,0x2FDB3E13), - SKEIN_MK_64(0xE8359030,0x1A79A9EB), - SKEIN_MK_64(0x55AEA061,0x4F816E6F), - SKEIN_MK_64(0x2A2767A4,0xAE9B94DB), - SKEIN_MK_64(0xEC06025E,0x74DD7683), - SKEIN_MK_64(0xE7A436CD,0xC4746251), - SKEIN_MK_64(0xC36FBAF9,0x393AD185), - SKEIN_MK_64(0x3EEDBA18,0x33EDFC13) - }; + { + SKEIN_MK_64(0xCCD044A1, 0x2FDB3E13), + SKEIN_MK_64(0xE8359030, 0x1A79A9EB), + SKEIN_MK_64(0x55AEA061, 0x4F816E6F), + SKEIN_MK_64(0x2A2767A4, 0xAE9B94DB), + SKEIN_MK_64(0xEC06025E, 0x74DD7683), + SKEIN_MK_64(0xE7A436CD, 0xC4746251), + SKEIN_MK_64(0xC36FBAF9, 0x393AD185), + SKEIN_MK_64(0x3EEDBA18, 0x33EDFC13)}; - Skein_512_Ctxt_t *ctx = &state->u.ctx_512; + Skein_512_Ctxt_t* ctx = &state->u.ctx_512; ctx->h.hashBitLen = hashBitLen; @@ -150,22 +171,47 @@ __device__ void cn_skein_init(skeinHashState *state, size_t hashBitLen) Skein_Start_New_Type(ctx, MSG); } -__device__ void cn_skein512_processblock(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__ blkPtr, size_t blkCnt, size_t byteCntAdd) +__device__ void cn_skein512_processblock(Skein_512_Ctxt_t* __restrict__ ctx, const uint8_t* __restrict__ blkPtr, size_t blkCnt, size_t byteCntAdd) { - enum { - R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37, - R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42, - R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39, - R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56, - R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24, - R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17, - R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43, - R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22 + enum + { + R_512_0_0 = 46, + R_512_0_1 = 36, + R_512_0_2 = 19, + R_512_0_3 = 37, + R_512_1_0 = 33, + R_512_1_1 = 27, + R_512_1_2 = 14, + R_512_1_3 = 42, + R_512_2_0 = 17, + R_512_2_1 = 49, + R_512_2_2 = 36, + R_512_2_3 = 39, + R_512_3_0 = 44, + R_512_3_1 = 9, + R_512_3_2 = 54, + R_512_3_3 = 56, + R_512_4_0 = 39, + R_512_4_1 = 30, + R_512_4_2 = 34, + R_512_4_3 = 24, + R_512_5_0 = 13, + R_512_5_1 = 50, + R_512_5_2 = 10, + R_512_5_3 = 17, + R_512_6_0 = 25, + R_512_6_1 = 29, + R_512_6_2 = 39, + R_512_6_3 = 43, + R_512_7_0 = 8, + R_512_7_1 = 35, + R_512_7_2 = 56, + R_512_7_3 = 22 }; - uint64_t X0,X1,X2,X3,X4,X5,X6,X7; + uint64_t X0, X1, X2, X3, X4, X5, X6, X7; uint64_t w[SKEIN_512_STATE_WORDS]; - uint64_t kw[SKEIN_512_STATE_WORDS+4]; + uint64_t kw[SKEIN_512_STATE_WORDS + 4]; ts[0] = ctx->h.T[0]; ts[1] = ctx->h.T[1]; @@ -184,7 +230,7 @@ __device__ void cn_skein512_processblock(Skein_512_Ctxt_t * __restrict__ ctx, co ks[6] = ctx->X[6]; ks[7] = ctx->X[7]; ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ - ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY; + ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY; ts[2] = ts[0] ^ ts[1]; @@ -201,15 +247,15 @@ __device__ void cn_skein512_processblock(Skein_512_Ctxt_t * __restrict__ ctx, co blkPtr += SKEIN_512_BLOCK_BYTES; - R512_8_rounds( 0); - R512_8_rounds( 1); - R512_8_rounds( 2); - R512_8_rounds( 3); - R512_8_rounds( 4); - R512_8_rounds( 5); - R512_8_rounds( 6); - R512_8_rounds( 7); - R512_8_rounds( 8); + R512_8_rounds(0); + R512_8_rounds(1); + R512_8_rounds(2); + R512_8_rounds(3); + R512_8_rounds(4); + R512_8_rounds(5); + R512_8_rounds(6); + R512_8_rounds(7); + R512_8_rounds(8); ctx->X[0] = X0 ^ w[0]; ctx->X[1] = X1 ^ w[1]; @@ -221,125 +267,124 @@ __device__ void cn_skein512_processblock(Skein_512_Ctxt_t * __restrict__ ctx, co ctx->X[7] = X7 ^ w[7]; ts[1] &= ~SKEIN_T1_FLAG_FIRST; - } - while (--blkCnt); + } while(--blkCnt); ctx->h.T[0] = ts[0]; ctx->h.T[1] = ts[1]; } -__device__ void cn_skein_final(skeinHashState * __restrict__ state, uint8_t * __restrict__ hashVal) +__device__ void cn_skein_final(skeinHashState* __restrict__ state, uint8_t* __restrict__ hashVal) { - size_t i,n,byteCnt; + size_t i, n, byteCnt; uint64_t X[SKEIN_512_STATE_WORDS]; - Skein_512_Ctxt_t *ctx = (Skein_512_Ctxt_t *)&state->u.ctx_512; + Skein_512_Ctxt_t* ctx = (Skein_512_Ctxt_t*)&state->u.ctx_512; //size_t tmp; //uint8_t *p8; //uint64_t *p64; ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; - if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) + if(ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) { - memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt); + memset(&ctx->b[ctx->h.bCnt], 0, SKEIN_512_BLOCK_BYTES - ctx->h.bCnt); //p8 = &ctx->b[ctx->h.bCnt]; //tmp = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt; //for( i = 0; i < tmp; i++ ) *(p8+i) = 0; } - cn_skein512_processblock(ctx,ctx->b,1,ctx->h.bCnt); + cn_skein512_processblock(ctx, ctx->b, 1, ctx->h.bCnt); byteCnt = (ctx->h.hashBitLen + 7) >> 3; //uint8_t b[SKEIN_512_BLOCK_BYTES] == 64 - memset(ctx->b,0,sizeof(ctx->b)); + memset(ctx->b, 0, sizeof(ctx->b)); //p64 = (uint64_t *)ctx->b; //for( i = 0; i < 8; i++ ) *(p64+i) = 0; - memcpy(X,ctx->X,sizeof(X)); + memcpy(X, ctx->X, sizeof(X)); - for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++) + for(i = 0; i * SKEIN_512_BLOCK_BYTES < byteCnt; i++) { - ((uint64_t *)ctx->b)[0]= (uint64_t)i; - Skein_Start_New_Type(ctx,OUT_FINAL); - cn_skein512_processblock(ctx,ctx->b,1,sizeof(uint64_t)); - n = byteCnt - i*SKEIN_512_BLOCK_BYTES; - if (n >= SKEIN_512_BLOCK_BYTES) - n = SKEIN_512_BLOCK_BYTES; - memcpy(hashVal+i*SKEIN_512_BLOCK_BYTES,ctx->X,n); - memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */ + ((uint64_t*)ctx->b)[0] = (uint64_t)i; + Skein_Start_New_Type(ctx, OUT_FINAL); + cn_skein512_processblock(ctx, ctx->b, 1, sizeof(uint64_t)); + n = byteCnt - i * SKEIN_512_BLOCK_BYTES; + if(n >= SKEIN_512_BLOCK_BYTES) + n = SKEIN_512_BLOCK_BYTES; + memcpy(hashVal + i * SKEIN_512_BLOCK_BYTES, ctx->X, n); + memcpy(ctx->X, X, sizeof(X)); /* restore the counter mode key for next time */ } } -__device__ void cn_skein512_update(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__ msg, size_t msgByteCnt) +__device__ void cn_skein512_update(Skein_512_Ctxt_t* __restrict__ ctx, const uint8_t* __restrict__ msg, size_t msgByteCnt) { size_t n; - if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES) + if(msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES) { - if (ctx->h.bCnt) + if(ctx->h.bCnt) { n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt; - if (n) + if(n) { - memcpy(&ctx->b[ctx->h.bCnt],msg,n); - msgByteCnt -= n; - msg += n; + memcpy(&ctx->b[ctx->h.bCnt], msg, n); + msgByteCnt -= n; + msg += n; ctx->h.bCnt += n; } - cn_skein512_processblock(ctx,ctx->b,1,SKEIN_512_BLOCK_BYTES); + cn_skein512_processblock(ctx, ctx->b, 1, SKEIN_512_BLOCK_BYTES); ctx->h.bCnt = 0; } - if (msgByteCnt > SKEIN_512_BLOCK_BYTES) + if(msgByteCnt > SKEIN_512_BLOCK_BYTES) { - n = (msgByteCnt-1) / SKEIN_512_BLOCK_BYTES; - cn_skein512_processblock(ctx,msg,n,SKEIN_512_BLOCK_BYTES); + n = (msgByteCnt - 1) / SKEIN_512_BLOCK_BYTES; + cn_skein512_processblock(ctx, msg, n, SKEIN_512_BLOCK_BYTES); msgByteCnt -= n * SKEIN_512_BLOCK_BYTES; - msg += n * SKEIN_512_BLOCK_BYTES; + msg += n * SKEIN_512_BLOCK_BYTES; } } - if (msgByteCnt) + if(msgByteCnt) { - memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt); + memcpy(&ctx->b[ctx->h.bCnt], msg, msgByteCnt); ctx->h.bCnt += msgByteCnt; } } -__device__ void cn_skein_update(skeinHashState * __restrict__ state, const BitSequence * __restrict__ data, DataLength databitlen) +__device__ void cn_skein_update(skeinHashState* __restrict__ state, const BitSequence* __restrict__ data, DataLength databitlen) { - if ((databitlen & 7) == 0) + if((databitlen & 7) == 0) { - cn_skein512_update(&state->u.ctx_512,data,databitlen >> 3); + cn_skein512_update(&state->u.ctx_512, data, databitlen >> 3); } else { size_t bCnt = (databitlen >> 3) + 1; - uint8_t b,mask; + uint8_t b, mask; - mask = (uint8_t) (1u << (7 - (databitlen & 7))); - b = (uint8_t) ((data[bCnt-1] & (0-mask)) | mask); + mask = (uint8_t)(1u << (7 - (databitlen & 7))); + b = (uint8_t)((data[bCnt - 1] & (0 - mask)) | mask); - cn_skein512_update(&state->u.ctx_512,data,bCnt-1); - cn_skein512_update(&state->u.ctx_512,&b , 1 ); + cn_skein512_update(&state->u.ctx_512, data, bCnt - 1); + cn_skein512_update(&state->u.ctx_512, &b, 1); Skein_Set_Bit_Pad_Flag(state->u.h); } } -__device__ void cn_skein(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval) +__device__ void cn_skein(const BitSequence* __restrict__ data, DataLength len, BitSequence* __restrict__ hashval) { int hashbitlen = 256; DataLength databitlen = len << 3; skeinHashState state; - state.statebits = 64*SKEIN_512_STATE_WORDS; + state.statebits = 64 * SKEIN_512_STATE_WORDS; cn_skein_init(&state, hashbitlen); cn_skein_update(&state, data, databitlen); diff --git a/xmrstak/backend/plugin.hpp b/xmrstak/backend/plugin.hpp index 5c7dfe16a..902a66230 100644 --- a/xmrstak/backend/plugin.hpp +++ b/xmrstak/backend/plugin.hpp @@ -3,22 +3,22 @@ #include "xmrstak/misc/environment.hpp" #include "xmrstak/params.hpp" -#include -#include -#include -#include #include "iBackend.hpp" +#include #include +#include +#include +#include #ifndef USE_PRECOMPILED_HEADERS -# ifdef WIN32 -# include -# include -# else -# include -# include -# endif -# include +#ifdef WIN32 +#include +#include +#else +#include +#include +#endif +#include #endif namespace xmrstak @@ -36,16 +36,16 @@ struct plugin libBackend = LoadLibrary(TEXT((libName + ".dll").c_str())); if(!libBackend) { - std::cerr << "WARNING: "<< m_backendName <<" cannot load backend library: " << (libName + ".dll") << std::endl; + std::cerr << "WARNING: " << m_backendName << " cannot load backend library: " << (libName + ".dll") << std::endl; return; } #else // `.so` linux file extention for dynamic libraries std::string fileExtension = ".so"; -# if defined(__APPLE__) +#if defined(__APPLE__) // `.dylib` Mac OS X file extention for dynamic libraries fileExtension = ".dylib"; -# endif +#endif // search library in working directory libBackend = dlopen(("./lib" + libName + fileExtension).c_str(), RTLD_LAZY); // fallback to binary directory @@ -56,21 +56,21 @@ struct plugin libBackend = dlopen(("lib" + libName + fileExtension).c_str(), RTLD_LAZY); if(!libBackend) { - std::cerr << "WARNING: "<< m_backendName <<" cannot load backend library: " << dlerror() << std::endl; + std::cerr << "WARNING: " << m_backendName << " cannot load backend library: " << dlerror() << std::endl; return; } #endif #ifdef WIN32 - fn_startBackend = (startBackend_t) GetProcAddress(libBackend, "xmrstak_start_backend"); - if (!fn_startBackend) + fn_startBackend = (startBackend_t)GetProcAddress(libBackend, "xmrstak_start_backend"); + if(!fn_startBackend) { - std::cerr << "WARNING: backend plugin " << libName << " contains no entry 'xmrstak_start_backend': " < +#include #include +#include #include -#include #include -#include #ifndef CONF_NO_TLS -#include #include +#include #endif #ifdef _WIN32 -# define strcasecmp _stricmp -# include -# include "xmrstak/misc/uac.hpp" +#define strcasecmp _stricmp +#include "xmrstak/misc/uac.hpp" +#include #endif // _WIN32 int do_benchmark(int block_version, int wait_sec, int work_sec); @@ -62,72 +61,76 @@ void help() using namespace std; using namespace xmrstak; - cout<<"Usage: "<> tmp; + getline(std::cin, tmp); + if(tmp.empty()) + tmp = default_value; std::transform(tmp.begin(), tmp.end(), tmp.begin(), ::tolower); - } - while(tmp != "y" && tmp != "n" && tmp != "yes" && tmp != "no"); + } while(tmp != "y" && tmp != "n" && tmp != "yes" && tmp != "no"); return tmp == "y" || tmp == "yes"; } @@ -139,34 +142,37 @@ inline const char* bool_to_str(bool v) std::string get_multipool_entry(bool& final) { - std::cout<> pool; std::string userName; - std::cout<<"- Username (wallet address or pool login):"<> userName; std::string passwd; - std::cin.clear(); std::cin.ignore(INT_MAX,'\n'); - std::cout<<"- Password (mostly empty or x):"<> pool_weight) || pool_weight <= 0) { std::cin.clear(); @@ -174,30 +180,37 @@ std::string get_multipool_entry(bool& final) std::cout << "Invalid weight. Try 1, 10, 100, etc:" << std::endl; } - final = !read_yes_no("- Do you want to add another pool? (y/n)"); + final = !read_yes_no("- Do you want to add another pool? (y/N)", "N"); - return "\t{\"pool_address\" : \"" + pool +"\", \"wallet_address\" : \"" + userName + "\", \"rig_id\" : \"" + rigid + - "\", \"pool_password\" : \"" + passwd + "\", \"use_nicehash\" : " + bool_to_str(nicehash) + ", \"use_tls\" : " + - bool_to_str(tls) + ", \"tls_fingerprint\" : \"\", \"pool_weight\" : " + std::to_string(pool_weight) + " },\n"; + return "\t{\"pool_address\" : \"" + pool + "\", \"wallet_address\" : \"" + userName + "\", \"rig_id\" : \"" + rigid + + "\", \"pool_password\" : \"" + passwd + "\", \"use_nicehash\" : " + bool_to_str(nicehash) + ", \"use_tls\" : " + + bool_to_str(tls) + ", \"tls_fingerprint\" : \"\", \"pool_weight\" : " + std::to_string(pool_weight) + " },\n"; } inline void prompt_once(bool& prompted) { if(!prompted) { - std::cout<<"Please enter:"<> tmp; } @@ -227,7 +240,7 @@ void do_guided_pool_config() prompt_once(prompted); userSetPool = false; - std::cout<<"- Pool address: e.g. " << jconf::GetDefaultPool(xmrstak::params::inst().currency.c_str()) << std::endl; + std::cout << "- Pool address: e.g. " << jconf::GetDefaultPool(xmrstak::params::inst().currency.c_str()) << std::endl; std::cin >> pool; } @@ -236,7 +249,7 @@ void do_guided_pool_config() { prompt_once(prompted); - std::cout<<"- Username (wallet address or pool login):"<> userName; } @@ -247,63 +260,67 @@ void do_guided_pool_config() prompt_once(prompted); // clear everything from stdin to allow an empty password - std::cin.clear(); std::cin.ignore(INT_MAX,'\n'); + std::cin.clear(); + std::cin.ignore(INT_MAX, '\n'); stdin_flushed = true; - std::cout<<"- Password (mostly empty or x):"<> pool_weight) || pool_weight <= 0) { @@ -312,13 +329,11 @@ void do_guided_pool_config() std::cout << "Invalid weight. Try 1, 10, 100, etc:" << std::endl; } } - else - pool_weight = 1; std::string pool_table; - pool_table += "\t{\"pool_address\" : \"" + pool +"\", \"wallet_address\" : \"" + userName + "\", \"rig_id\" : \"" + rigid + - "\", \"pool_password\" : \"" + passwd + "\", \"use_nicehash\" : " + bool_to_str(nicehash) + ", \"use_tls\" : " + - bool_to_str(tls) + ", \"tls_fingerprint\" : \"\", \"pool_weight\" : " + std::to_string(pool_weight) + " },\n"; + pool_table += "\t{\"pool_address\" : \"" + pool + "\", \"wallet_address\" : \"" + userName + "\", \"rig_id\" : \"" + rigid + + "\", \"pool_password\" : \"" + passwd + "\", \"use_nicehash\" : " + bool_to_str(nicehash) + ", \"use_tls\" : " + + bool_to_str(tls) + ", \"tls_fingerprint\" : \"\", \"pool_weight\" : " + std::to_string(pool_weight) + " },\n"; if(multipool) { @@ -326,14 +341,13 @@ void do_guided_pool_config() do { pool_table += get_multipool_entry(final); - } - while(!final); + } while(!final); } configTpl.replace("CURRENCY", currency); configTpl.replace("POOLCONF", pool_table); configTpl.write(params::inst().configFilePools); - std::cout<<"Pool configuration stored in file '"<> port) || port < 0 || port > 65535) - { - std::cin.clear(); - std::cin.ignore(INT_MAX, '\n'); - std::cout << "Invalid port number. Please enter a number between 0 and 65535." << std::endl; + int32_t port; + while(!(std::cin >> port) || port < 0 || port > 65535) + { + std::cin.clear(); + std::cin.ignore(INT_MAX, '\n'); + std::cout << "Invalid port number. Please enter a number between 0 and 65535." << std::endl; + } + http_port = port; } - - http_port = port; #endif } configTpl.replace("HTTP_PORT", std::to_string(http_port)); configTpl.write(params::inst().configFile); - std::cout<<"Configuration stored in file '"<=argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '--openCLVendor' given"); win_exit(); @@ -488,7 +503,7 @@ int main(int argc, char *argv[]) else if(opName.compare("--cpu") == 0) { ++i; - if( i >=argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '--cpu' given"); win_exit(); @@ -499,7 +514,7 @@ int main(int argc, char *argv[]) else if(opName.compare("--amd") == 0) { ++i; - if( i >=argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '--amd' given"); win_exit(); @@ -507,10 +522,21 @@ int main(int argc, char *argv[]) } params::inst().configFileAMD = argv[i]; } + else if(opName.compare("--amdCacheDir") == 0) + { + ++i; + if(i >= argc) + { + printer::inst()->print_msg(L0, "No argument for parameter '--amdCacheDir' given"); + win_exit(); + return 1; + } + params::inst().rootAMDCacheDir = std::string(argv[i]) + "/"; + } else if(opName.compare("--nvidia") == 0) { ++i; - if( i >=argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '--nvidia' given"); win_exit(); @@ -521,7 +547,7 @@ int main(int argc, char *argv[]) else if(opName.compare("--currency") == 0) { ++i; - if( i >=argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '--currency' given"); win_exit(); @@ -532,7 +558,7 @@ int main(int argc, char *argv[]) else if(opName.compare("-o") == 0 || opName.compare("--url") == 0) { ++i; - if( i >=argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '-o/--url' given"); win_exit(); @@ -544,7 +570,7 @@ int main(int argc, char *argv[]) else if(opName.compare("-O") == 0 || opName.compare("--tls-url") == 0) { ++i; - if( i >=argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '-O/--tls-url' given"); win_exit(); @@ -563,7 +589,7 @@ int main(int argc, char *argv[]) } ++i; - if( i >=argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '-u/--user' given"); win_exit(); @@ -581,7 +607,7 @@ int main(int argc, char *argv[]) } ++i; - if( i >=argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '-p/--pass' given"); win_exit(); @@ -600,7 +626,7 @@ int main(int argc, char *argv[]) } ++i; - if( i >=argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '-r/--rigid' given"); win_exit(); @@ -617,7 +643,7 @@ int main(int argc, char *argv[]) else if(opName.compare("-c") == 0 || opName.compare("--config") == 0) { ++i; - if( i >=argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '-c/--config' given"); win_exit(); @@ -628,7 +654,7 @@ int main(int argc, char *argv[]) else if(opName.compare("-C") == 0 || opName.compare("--poolconf") == 0) { ++i; - if( i >=argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '-C/--poolconf' given"); win_exit(); @@ -639,7 +665,7 @@ int main(int argc, char *argv[]) else if(opName.compare("-i") == 0 || opName.compare("--httpd") == 0) { ++i; - if( i >=argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '-i/--httpd' given"); win_exit(); @@ -665,7 +691,7 @@ int main(int argc, char *argv[]) else if(opName.compare("--benchmark") == 0) { ++i; - if( i >= argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '--benchmark' given"); win_exit(); @@ -684,7 +710,7 @@ int main(int argc, char *argv[]) else if(opName.compare("--benchwait") == 0) { ++i; - if( i >= argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '--benchwait' given"); win_exit(); @@ -703,7 +729,7 @@ int main(int argc, char *argv[]) else if(opName.compare("--benchwork") == 0) { ++i; - if( i >= argc ) + if(i >= argc) { printer::inst()->print_msg(L0, "No argument for parameter '--benchwork' given"); win_exit(); @@ -721,17 +747,20 @@ int main(int argc, char *argv[]) } else { - printer::inst()->print_msg(L0, "Parameter unknown '%s'",argv[i]); + printer::inst()->print_msg(L0, "Parameter unknown '%s'", argv[i]); win_exit(); return 1; } } + bool hasConfigFile = configEditor::file_exist(params::inst().configFile); + bool hasPoolConfig = configEditor::file_exist(params::inst().configFilePools); + // check if we need a guided start - if(!configEditor::file_exist(params::inst().configFile)) + if(!hasConfigFile) do_guided_config(); - if(!configEditor::file_exist(params::inst().configFilePools)) + if(!hasPoolConfig) do_guided_pool_config(); if(!jconf::inst()->parse_config(params::inst().configFile.c_str(), params::inst().configFilePools.c_str())) @@ -756,7 +785,7 @@ int main(int argc, char *argv[]) if(strlen(jconf::inst()->GetOutputFile()) != 0) printer::inst()->open_logfile(jconf::inst()->GetOutputFile()); - if (!BackendConnector::self_test()) + if(!BackendConnector::self_test()) { printer::inst()->print_msg(L0, "Self test not passed!"); win_exit(); @@ -770,7 +799,7 @@ int main(int argc, char *argv[]) win_exit(); return 1; #else - if (!httpd::inst()->start_daemon()) + if(!httpd::inst()->start_daemon()) { win_exit(); return 1; @@ -845,7 +874,7 @@ int main(int argc, char *argv[]) uint64_t currentTime = get_timestamp_ms(); /* Hard guard to make sure we never get called more than twice per second */ - if( currentTime - lastTime < 500) + if(currentTime - lastTime < 500) std::this_thread::sleep_for(std::chrono::milliseconds(500 - (currentTime - lastTime))); lastTime = currentTime; } @@ -861,7 +890,7 @@ int do_benchmark(int block_version, int wait_sec, int work_sec) printer::inst()->print_msg(L0, "Prepare benchmark for block version %d", block_version); uint8_t work[128]; - memset(work,0,128); + memset(work, 0, 128); work[0] = static_cast(block_version); xmrstak::pool_data dat; @@ -869,12 +898,12 @@ int do_benchmark(int block_version, int wait_sec, int work_sec) xmrstak::miner_work oWork = xmrstak::miner_work(); pvThreads = xmrstak::BackendConnector::thread_starter(oWork); - printer::inst()->print_msg(L0, "Wait %d sec until all backends are initialized",wait_sec); + printer::inst()->print_msg(L0, "Wait %d sec until all backends are initialized", wait_sec); std::this_thread::sleep_for(std::chrono::seconds(wait_sec)); /* AMD and NVIDIA is currently only supporting work sizes up to 128byte */ - printer::inst()->print_msg(L0, "Start a %d second benchmark...",work_sec); + printer::inst()->print_msg(L0, "Start a %d second benchmark...", work_sec); xmrstak::globalStates::inst().switch_work(xmrstak::miner_work("", work, 128, 0, false, 0, 0), dat); uint64_t iStartStamp = get_timestamp_ms(); @@ -882,7 +911,7 @@ int do_benchmark(int block_version, int wait_sec, int work_sec) xmrstak::globalStates::inst().switch_work(xmrstak::miner_work("", work, 128, 0, false, 0, 0), dat); double fTotalHps = 0.0; - for (uint32_t i = 0; i < pvThreads->size(); i++) + for(uint32_t i = 0; i < pvThreads->size(); i++) { double fHps = pvThreads->at(i)->iHashCount; fHps /= (pvThreads->at(i)->iTimestamp - iStartStamp) / 1000.0; @@ -890,7 +919,7 @@ int do_benchmark(int block_version, int wait_sec, int work_sec) auto bType = static_cast(pvThreads->at(i)->backendType); std::string name(xmrstak::iBackend::getName(bType)); - printer::inst()->print_msg(L0, "Benchmark Thread %u %s: %.1f H/S", i,name.c_str(), fHps); + printer::inst()->print_msg(L0, "Benchmark Thread %u %s: %.1f H/S", i, name.c_str(), fHps); fTotalHps += fHps; } diff --git a/xmrstak/http/httpd.cpp b/xmrstak/http/httpd.cpp index ed9abc2bc..b4f0f547e 100644 --- a/xmrstak/http/httpd.cpp +++ b/xmrstak/http/httpd.cpp @@ -23,16 +23,15 @@ #ifndef CONF_NO_HTTPD - #include "httpd.hpp" #include "webdesign.hpp" -#include "xmrstak/net/msgstruct.hpp" +#include "xmrstak/jconf.hpp" #include "xmrstak/misc/console.hpp" #include "xmrstak/misc/executor.hpp" -#include "xmrstak/jconf.hpp" +#include "xmrstak/net/msgstruct.hpp" -#include #include +#include #include #include @@ -45,21 +44,20 @@ httpd* httpd::oInst = nullptr; httpd::httpd() { - } -int httpd::req_handler(void * cls, - MHD_Connection* connection, - const char* url, - const char* method, - const char* version, - const char* upload_data, - size_t* upload_data_size, - void ** ptr) +int httpd::req_handler(void* cls, + MHD_Connection* connection, + const char* url, + const char* method, + const char* version, + const char* upload_data, + size_t* upload_data_size, + void** ptr) { - struct MHD_Response * rsp; + struct MHD_Response* rsp; - if (strcmp(method, "GET") != 0) + if(strcmp(method, "GET") != 0) return MHD_NO; if(strlen(jconf::inst()->GetHttpUsername()) != 0) @@ -68,7 +66,7 @@ int httpd::req_handler(void * cls, int ret; username = MHD_digest_auth_get_username(connection); - if (username == NULL) + if(username == NULL) { rsp = MHD_create_response_from_buffer(sHtmlAccessDeniedSize, (void*)sHtmlAccessDenied, MHD_RESPMEM_PERSISTENT); ret = MHD_queue_auth_fail_response(connection, sHttpAuthRealm, sHttpAuthOpaque, rsp, MHD_NO); @@ -78,7 +76,7 @@ int httpd::req_handler(void * cls, free(username); ret = MHD_digest_auth_check(connection, sHttpAuthRealm, jconf::inst()->GetHttpUsername(), jconf::inst()->GetHttpPassword(), 300); - if (ret == MHD_INVALID_NONCE || ret == MHD_NO) + if(ret == MHD_INVALID_NONCE || ret == MHD_NO) { rsp = MHD_create_response_from_buffer(sHtmlAccessDeniedSize, (void*)sHtmlAccessDenied, MHD_RESPMEM_PERSISTENT); ret = MHD_queue_auth_fail_response(connection, sHttpAuthRealm, sHttpAuthOpaque, rsp, (ret == MHD_INVALID_NONCE) ? MHD_YES : MHD_NO); @@ -174,4 +172,3 @@ bool httpd::start_daemon() } #endif - diff --git a/xmrstak/http/httpd.hpp b/xmrstak/http/httpd.hpp index fe534f038..dfad082ca 100644 --- a/xmrstak/http/httpd.hpp +++ b/xmrstak/http/httpd.hpp @@ -7,27 +7,28 @@ struct MHD_Connection; class httpd { -public: + public: static httpd* inst() { - if (oInst == nullptr) oInst = new httpd; + if(oInst == nullptr) + oInst = new httpd; return oInst; }; bool start_daemon(); -private: + private: httpd(); static httpd* oInst; - static int req_handler(void * cls, - MHD_Connection* connection, - const char* url, - const char* method, - const char* version, - const char* upload_data, - size_t* upload_data_size, - void ** ptr); + static int req_handler(void* cls, + MHD_Connection* connection, + const char* url, + const char* method, + const char* version, + const char* upload_data, + size_t* upload_data_size, + void** ptr); - MHD_Daemon *d; + MHD_Daemon* d; }; diff --git a/xmrstak/http/webdesign.cpp b/xmrstak/http/webdesign.cpp index 8f20078aa..fbd565269 100644 --- a/xmrstak/http/webdesign.cpp +++ b/xmrstak/http/webdesign.cpp @@ -1,114 +1,114 @@ #include -extern const char sHtmlCssEtag [] = "00000009"; -extern const char sHtmlCssFile [] = +extern const char sHtmlCssEtag[] = "00000009"; +extern const char sHtmlCssFile[] = "body {" - "font-family: Tahoma, Arial, sans-serif;" - "font-size: 80%;" - "background-color: rgb(240, 240, 240);" + "font-family: Tahoma, Arial, sans-serif;" + "font-size: 80%;" + "background-color: rgb(240, 240, 240);" "}" "a {" - "color: rgb(44, 55, 66);" + "color: rgb(44, 55, 66);" "}" "a:link {" - "text-decoration: none;" + "text-decoration: none;" "}" "a:visited {" - "color: rgb(44, 55, 66);" + "color: rgb(44, 55, 66);" "}" "a:hover {" - "color: rgb(255, 153, 0);" + "color: rgb(255, 153, 0);" "}" "a:active {" - "color: rgb(204, 122, 0);" + "color: rgb(204, 122, 0);" "}" ".all {" - "max-width:600px;" - "margin: auto;" + "max-width:600px;" + "margin: auto;" "}" ".header {" - "background-color: rgb(30, 30, 30);" - "color: white;" - "padding: 10px;" - "font-weight: bold;" - "margin: 0px;" - "margin-bottom: 10px;" + "background-color: rgb(30, 30, 30);" + "color: white;" + "padding: 10px;" + "font-weight: bold;" + "margin: 0px;" + "margin-bottom: 10px;" "}" ".version {" - "font-size: 75%;" - "text-align: right;" + "font-size: 75%;" + "text-align: right;" "}" ".links {" - "padding: 7px;" - "text-align: center;" - "background-color: rgb(215, 215, 215);" - "box-shadow: 0px 1px 3px 0px rgba(0, 0, 0, 0.2), 0px 1px 1px 0px rgba(0, 0, 0, 0.14), 0px 2px 1px -1px rgba(0, 0, 0, 0.12);" + "padding: 7px;" + "text-align: center;" + "background-color: rgb(215, 215, 215);" + "box-shadow: 0px 1px 3px 0px rgba(0, 0, 0, 0.2), 0px 1px 1px 0px rgba(0, 0, 0, 0.14), 0px 2px 1px -1px rgba(0, 0, 0, 0.12);" "}" ".data th, td {" - "padding: 5px 12px;" - "text-align: right;" - "border-bottom: 1px solid #ccc;" + "padding: 5px 12px;" + "text-align: right;" + "border-bottom: 1px solid #ccc;" "}" ".data tr:nth-child(even) {" - "background-color: #ddd;" + "background-color: #ddd;" "}" ".data th {" - "background-color: #ccc;" + "background-color: #ccc;" "}" ".data table {" - "width: 100%;" - "max-width: 600px;" + "width: 100%;" + "max-width: 600px;" "}" ".letter {" - "font-weight: bold;" + "font-weight: bold;" "}" "h4 {" - "background-color: rgb(0, 130, 130);" - "color: white;" - "padding: 10px;" - "margin: 10px 0px;" + "background-color: rgb(0, 130, 130);" + "color: white;" + "padding: 10px;" + "margin: 10px 0px;" "}" ".flex-container {" - "display: -webkit-flex;" - "display: flex;" + "display: -webkit-flex;" + "display: flex;" "}" ".flex-item {" - "width: 33%;" - "margin: 3px;" + "width: 33%;" + "margin: 3px;" "}" ".motd-box {" - "background-color: #ccc;" - "padding: 0px 10px 5px 10px;" - "margin-bottom: 10px;" + "background-color: #ccc;" + "padding: 0px 10px 5px 10px;" + "margin-bottom: 10px;" "}" ".motd-head {" - "border-bottom: 1px solid #000;" - "margin-bottom: 0.5em;" - "padding: 0.5em 0em;" - "font-weight: bold;" + "border-bottom: 1px solid #000;" + "margin-bottom: 0.5em;" + "padding: 0.5em 0em;" + "font-weight: bold;" "}" ".motd-body {" - "overflow: hidden;" + "overflow: hidden;" "}"; size_t sHtmlCssSize = sizeof(sHtmlCssFile) - 1; @@ -124,7 +124,7 @@ extern const char sHtmlAccessDenied[] = size_t sHtmlAccessDeniedSize = sizeof(sHtmlAccessDenied) - 1; -extern const char sHtmlCommonHeader [] = +extern const char sHtmlCommonHeader[] = "" "" "" @@ -135,15 +135,15 @@ extern const char sHtmlCommonHeader [] = "
XMR-Stak Monero Miner
" "
" - "" - "" - "" + "" + "" + "" "
" "

%s

"; @@ -151,61 +151,61 @@ extern const char sHtmlMotdBoxStart[] = "
"; extern const char sHtmlMotdEntry[] = "
Message from %s
%s
"; extern const char sHtmlMotdBoxEnd[] = "
"; -extern const char sHtmlHashrateBodyHigh [] = +extern const char sHtmlHashrateBodyHigh[] = "
" "" - ""; + ""; -extern const char sHtmlHashrateTableRow [] = +extern const char sHtmlHashrateTableRow[] = ""; -extern const char sHtmlHashrateBodyLow [] = - "" - "" +extern const char sHtmlHashrateBodyLow[] = + "" + "" "
Thread ID10s60s15mH/s
Thread ID10s60s15mH/s
%s%s%s%s
Totals:%s%s%s
Highest:%s
Totals:%s%s%s
Highest:%s
" "
"; -extern const char sHtmlConnectionBodyHigh [] = +extern const char sHtmlConnectionBodyHigh[] = "
" "" - "" - "" - "" - "" + "" + "" + "" + "" "
Rig ID%s
Pool address%s
Connected since%s
Pool ping time%u ms
Rig ID%s
Pool address%s
Connected since%s
Pool ping time%u ms
" "

Network error log

" "" - ""; + ""; -extern const char sHtmlConnectionTableRow [] = +extern const char sHtmlConnectionTableRow[] = ""; -extern const char sHtmlConnectionBodyLow [] = +extern const char sHtmlConnectionBodyLow[] = "
DateError
DateError
%s%s
"; -extern const char sHtmlResultBodyHigh [] = +extern const char sHtmlResultBodyHigh[] = "
" "" - "" - "" - "" - "" - "" + "" + "" + "" + "" + "" "
Currency%s
Difficulty%u
Good results%u / %u (%.1f %%)
Avg result time%.1f sec
Pool-side hashes%u
Currency%s
Difficulty%u
Good results%u / %u (%.1f %%)
Avg result time%.1f sec
Pool-side hashes%u
" "

Top 10 best results found

" "" - "" - "" - "" - "" - "" + "" + "" + "" + "" + "" "
1%llu2%llu
3%llu4%llu
5%llu6%llu
7%llu8%llu
9%llu10%llu
1%llu2%llu
3%llu4%llu
5%llu6%llu
7%llu8%llu
9%llu10%llu
" "

Error details

" "" - "" - ""; + "" + ""; -extern const char sHtmlResultTableRow [] = +extern const char sHtmlResultTableRow[] = ""; extern const char sHtmlResultBodyLow[] = @@ -220,31 +220,30 @@ extern const char sJsonApiResultError[] = extern const char sJsonApiConnectionError[] = "{\"last_seen\":%llu,\"text\":\"%s\"}"; -extern const char sJsonApiFormat [] = -"{" +extern const char sJsonApiFormat[] = + "{" "\"version\":\"%s\"," "\"hashrate\":{" - "\"threads\":[%s]," - "\"total\":%s," - "\"highest\":%s" + "\"threads\":[%s]," + "\"total\":%s," + "\"highest\":%s" "}," "\"results\":{" - "\"diff_current\":%llu," - "\"shares_good\":%llu," - "\"shares_total\":%llu," - "\"avg_time\":%.1f," - "\"hashes_total\":%llu," - "\"best\":[%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu]," - "\"error_log\":[%s]" + "\"diff_current\":%llu," + "\"shares_good\":%llu," + "\"shares_total\":%llu," + "\"avg_time\":%.1f," + "\"hashes_total\":%llu," + "\"best\":[%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu]," + "\"error_log\":[%s]" "}," "\"connection\":{" - "\"pool\": \"%s\"," - "\"uptime\":%llu," - "\"ping\":%llu," - "\"error_log\":[%s]" + "\"pool\": \"%s\"," + "\"uptime\":%llu," + "\"ping\":%llu," + "\"error_log\":[%s]" "}" -"}"; - + "}"; diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp index 5e3384a63..10082a09f 100644 --- a/xmrstak/jconf.cpp +++ b/xmrstak/jconf.cpp @@ -26,16 +26,15 @@ #include "xmrstak/misc/console.hpp" #include "xmrstak/misc/jext.hpp" -#include "xmrstak/misc/console.hpp" #include "xmrstak/misc/utility.hpp" +#include +#include +#include #include #include #include -#include #include -#include -#include #ifdef _WIN32 #define strcasecmp _stricmp @@ -44,18 +43,34 @@ #include #endif - using namespace rapidjson; /* * This enum needs to match index in oConfigValues, otherwise we will get a runtime error */ -enum configEnum { - aPoolList, sCurrency, bTlsSecureAlgo, iCallTimeout, iNetRetry, iGiveUpLimit, iVerboseLevel, bPrintMotd, iAutohashTime, - bDaemonMode, sOutputFile, iHttpdPort, sHttpLogin, sHttpPass, bPreferIpv4, bAesOverride, sUseSlowMem +enum configEnum +{ + aPoolList, + sCurrency, + bTlsSecureAlgo, + iCallTimeout, + iNetRetry, + iGiveUpLimit, + iVerboseLevel, + bPrintMotd, + iAutohashTime, + bDaemonMode, + sOutputFile, + iHttpdPort, + sHttpLogin, + sHttpPass, + bPreferIpv4, + bAesOverride, + sUseSlowMem }; -struct configVal { +struct configVal +{ configEnum iName; const char* sName; Type iType; @@ -64,68 +79,66 @@ struct configVal { // Same order as in configEnum, as per comment above // kNullType means any type configVal oConfigValues[] = { - { aPoolList, "pool_list", kArrayType }, - { sCurrency, "currency", kStringType }, - { bTlsSecureAlgo, "tls_secure_algo", kTrueType }, - { iCallTimeout, "call_timeout", kNumberType }, - { iNetRetry, "retry_time", kNumberType }, - { iGiveUpLimit, "giveup_limit", kNumberType }, - { iVerboseLevel, "verbose_level", kNumberType }, - { bPrintMotd, "print_motd", kTrueType }, - { iAutohashTime, "h_print_time", kNumberType }, - { bDaemonMode, "daemon_mode", kTrueType }, - { sOutputFile, "output_file", kStringType }, - { iHttpdPort, "httpd_port", kNumberType }, - { sHttpLogin, "http_login", kStringType }, - { sHttpPass, "http_pass", kStringType }, - { bPreferIpv4, "prefer_ipv4", kTrueType }, - { bAesOverride, "aes_override", kNullType }, - { sUseSlowMem, "use_slow_memory", kStringType } -}; - -constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0])); + {aPoolList, "pool_list", kArrayType}, + {sCurrency, "currency", kStringType}, + {bTlsSecureAlgo, "tls_secure_algo", kTrueType}, + {iCallTimeout, "call_timeout", kNumberType}, + {iNetRetry, "retry_time", kNumberType}, + {iGiveUpLimit, "giveup_limit", kNumberType}, + {iVerboseLevel, "verbose_level", kNumberType}, + {bPrintMotd, "print_motd", kTrueType}, + {iAutohashTime, "h_print_time", kNumberType}, + {bDaemonMode, "daemon_mode", kTrueType}, + {sOutputFile, "output_file", kStringType}, + {iHttpdPort, "httpd_port", kNumberType}, + {sHttpLogin, "http_login", kStringType}, + {sHttpPass, "http_pass", kStringType}, + {bPreferIpv4, "prefer_ipv4", kTrueType}, + {bAesOverride, "aes_override", kNullType}, + {sUseSlowMem, "use_slow_memory", kStringType}}; + +constexpr size_t iConfigCnt = (sizeof(oConfigValues) / sizeof(oConfigValues[0])); xmrstak::coin_selection coins[] = { // name, userpool, devpool, default_pool_suggestion - { "aeon7", {POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, "mine.aeon-pool.com:5555" }, - { "bbscoin", {POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr }, - { "bittube", {POW(cryptonight_bittube2)}, {POW(cryptonight_gpu)}, "mining.bit.tube:13333" }, - { "cryptonight", {POW(cryptonight)}, {POW(cryptonight_gpu)}, nullptr }, - { "cryptonight_bittube2", {POW(cryptonight_bittube2)}, {POW(cryptonight_gpu)}, nullptr }, - { "cryptonight_masari", {POW(cryptonight_masari)}, {POW(cryptonight_gpu)}, nullptr }, - { "cryptonight_haven", {POW(cryptonight_haven)}, {POW(cryptonight_gpu)}, nullptr }, - { "cryptonight_heavy", {POW(cryptonight_heavy)}, {POW(cryptonight_gpu)}, nullptr }, - { "cryptonight_lite", {POW(cryptonight_lite)}, {POW(cryptonight_aeon)}, nullptr }, - { "cryptonight_lite_v7", {POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr }, - { "cryptonight_lite_v7_xor", {POW(cryptonight_ipbc)}, {POW(cryptonight_aeon)}, nullptr }, - { "cryptonight_r", {POW(cryptonight_r)}, {POW(cryptonight_r)}, nullptr }, - { "cryptonight_superfast", {POW(cryptonight_superfast)}, {POW(cryptonight_gpu)}, nullptr }, - { "cryptonight_turtle", {POW(cryptonight_turtle)}, {POW(cryptonight_turtle)}, nullptr }, - { "cryptonight_v7", {POW(cryptonight_monero)}, {POW(cryptonight_gpu)}, nullptr }, - { "cryptonight_v8", {POW(cryptonight_monero_v8)}, {POW(cryptonight_r)}, nullptr }, - { "cryptonight_v8_double", {POW(cryptonight_v8_double)}, {POW(cryptonight_gpu)}, nullptr }, - { "cryptonight_v8_half", {POW(cryptonight_v8_half)}, {POW(cryptonight_gpu)}, nullptr }, - { "cryptonight_v8_reversewaltz", {POW(cryptonight_v8_reversewaltz)}, {POW(cryptonight_gpu)}, nullptr }, - { "cryptonight_v8_zelerius", {POW(cryptonight_v8_zelerius)},{POW(cryptonight_gpu)}, nullptr }, - { "cryptonight_v7_stellite", {POW(cryptonight_stellite)}, {POW(cryptonight_gpu)}, nullptr }, - { "cryptonight_gpu", {POW(cryptonight_gpu)}, {POW(cryptonight_gpu)}, "pool.ryo-currency.com:3333" }, - { "cryptonight_conceal", {POW(cryptonight_conceal)}, {POW(cryptonight_gpu)}, nullptr }, - { "freehaven", {POW(cryptonight_superfast)}, {POW(cryptonight_gpu)}, nullptr }, - { "graft", {POW(cryptonight_v8_reversewaltz), 12, POW(cryptonight_monero_v8)}, {POW(cryptonight_gpu)}, nullptr }, - { "haven", {POW(cryptonight_haven)}, {POW(cryptonight_gpu)}, nullptr }, - { "lethean", {POW(cryptonight_monero)}, {POW(cryptonight_gpu)}, nullptr }, - { "masari", {POW(cryptonight_v8_half)}, {POW(cryptonight_gpu)}, nullptr }, - { "monero", {POW(cryptonight_r)}, {POW(cryptonight_r)}, "pool.usxmrpool.com:3333" }, - { "qrl", {POW(cryptonight_monero)}, {POW(cryptonight_gpu)}, nullptr }, - { "ryo", {POW(cryptonight_gpu)}, {POW(cryptonight_gpu)}, "pool.ryo-currency.com:3333" }, - { "stellite", {POW(cryptonight_v8_half)}, {POW(cryptonight_gpu)}, nullptr }, - { "turtlecoin", {POW(cryptonight_turtle), 6u,POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr }, - { "plenteum", {POW(cryptonight_turtle)}, {POW(cryptonight_turtle)}, nullptr }, - { "zelerius", {POW(cryptonight_v8_zelerius), 7, POW(cryptonight_monero_v8)}, {POW(cryptonight_gpu)}, nullptr }, - { "xcash", {POW(cryptonight_v8_double)}, {POW(cryptonight_gpu)}, nullptr } -}; - -constexpr size_t coin_algo_size = (sizeof(coins)/sizeof(coins[0])); + {"aeon7", {POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, "mine.aeon-pool.com:5555"}, + {"bbscoin", {POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr}, + {"bittube", {POW(cryptonight_bittube2)}, {POW(cryptonight_gpu)}, "mining.bit.tube:13333"}, + {"cryptonight", {POW(cryptonight)}, {POW(cryptonight_gpu)}, nullptr}, + {"cryptonight_bittube2", {POW(cryptonight_bittube2)}, {POW(cryptonight_gpu)}, nullptr}, + {"cryptonight_masari", {POW(cryptonight_masari)}, {POW(cryptonight_gpu)}, nullptr}, + {"cryptonight_haven", {POW(cryptonight_haven)}, {POW(cryptonight_gpu)}, nullptr}, + {"cryptonight_heavy", {POW(cryptonight_heavy)}, {POW(cryptonight_gpu)}, nullptr}, + {"cryptonight_lite", {POW(cryptonight_lite)}, {POW(cryptonight_aeon)}, nullptr}, + {"cryptonight_lite_v7", {POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr}, + {"cryptonight_lite_v7_xor", {POW(cryptonight_ipbc)}, {POW(cryptonight_aeon)}, nullptr}, + {"cryptonight_r", {POW(cryptonight_r)}, {POW(cryptonight_r)}, nullptr}, + {"cryptonight_superfast", {POW(cryptonight_superfast)}, {POW(cryptonight_gpu)}, nullptr}, + {"cryptonight_turtle", {POW(cryptonight_turtle)}, {POW(cryptonight_turtle)}, nullptr}, + {"cryptonight_v7", {POW(cryptonight_monero)}, {POW(cryptonight_gpu)}, nullptr}, + {"cryptonight_v8", {POW(cryptonight_monero_v8)}, {POW(cryptonight_r)}, nullptr}, + {"cryptonight_v8_double", {POW(cryptonight_v8_double)}, {POW(cryptonight_gpu)}, nullptr}, + {"cryptonight_v8_half", {POW(cryptonight_v8_half)}, {POW(cryptonight_gpu)}, nullptr}, + {"cryptonight_v8_reversewaltz", {POW(cryptonight_v8_reversewaltz)}, {POW(cryptonight_gpu)}, nullptr}, + {"cryptonight_v8_zelerius", {POW(cryptonight_v8_zelerius)}, {POW(cryptonight_gpu)}, nullptr}, + {"cryptonight_v7_stellite", {POW(cryptonight_stellite)}, {POW(cryptonight_gpu)}, nullptr}, + {"cryptonight_gpu", {POW(cryptonight_gpu)}, {POW(cryptonight_gpu)}, "pool.ryo-currency.com:3333"}, + {"cryptonight_conceal", {POW(cryptonight_conceal)}, {POW(cryptonight_gpu)}, nullptr}, + {"freehaven", {POW(cryptonight_superfast)}, {POW(cryptonight_gpu)}, nullptr}, + {"graft", {POW(cryptonight_v8_reversewaltz), 12, POW(cryptonight_monero_v8)}, {POW(cryptonight_gpu)}, nullptr}, + {"haven", {POW(cryptonight_haven)}, {POW(cryptonight_gpu)}, nullptr}, + {"lethean", {POW(cryptonight_monero)}, {POW(cryptonight_gpu)}, nullptr}, + {"masari", {POW(cryptonight_v8_half)}, {POW(cryptonight_gpu)}, nullptr}, + {"monero", {POW(cryptonight_r)}, {POW(cryptonight_r)}, "pool.usxmrpool.com:3333"}, + {"qrl", {POW(cryptonight_monero)}, {POW(cryptonight_gpu)}, nullptr}, + {"ryo", {POW(cryptonight_gpu)}, {POW(cryptonight_gpu)}, "pool.ryo-currency.com:3333"}, + {"stellite", {POW(cryptonight_v8_half)}, {POW(cryptonight_gpu)}, nullptr}, + {"turtlecoin", {POW(cryptonight_turtle), 6u, POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr}, + {"plenteum", {POW(cryptonight_turtle)}, {POW(cryptonight_turtle)}, nullptr}, + {"zelerius", {POW(cryptonight_v8_zelerius), 7, POW(cryptonight_monero_v8)}, {POW(cryptonight_gpu)}, nullptr}, + {"xcash", {POW(cryptonight_v8_double)}, {POW(cryptonight_gpu)}, nullptr}}; + +constexpr size_t coin_algo_size = (sizeof(coins) / sizeof(coins[0])); inline bool checkType(Type have, Type want) { @@ -275,7 +288,7 @@ const char* jconf::GetOutputFile() void jconf::cpuid(uint32_t eax, int32_t ecx, int32_t val[4]) { - memset(val, 0, sizeof(int32_t)*4); + memset(val, 0, sizeof(int32_t) * 4); #ifdef _WIN32 __cpuidex(val, eax, ecx); @@ -326,7 +339,7 @@ std::string jconf::GetMiningCoin() void jconf::GetAlgoList(std::string& list) { list.reserve(256); - for(size_t i=0; i < coin_algo_size; i++) + for(size_t i = 0; i < coin_algo_size; i++) { list += "\t- "; list += coins[i].coin_name; @@ -338,7 +351,7 @@ bool jconf::IsOnAlgoList(std::string& needle) { std::transform(needle.begin(), needle.end(), needle.begin(), ::tolower); - for(size_t i=0; i < coin_algo_size; i++) + for(size_t i = 0; i < coin_algo_size; i++) { if(needle == coins[i].coin_name) return true; @@ -350,7 +363,7 @@ const char* jconf::GetDefaultPool(const char* needle) { const char* default_example = "pool.example.com:3333"; - for(size_t i=0; i < coin_algo_size; i++) + for(size_t i = 0; i < coin_algo_size; i++) { if(strcmp(needle, coins[i].coin_name) == 0) { @@ -366,22 +379,22 @@ const char* jconf::GetDefaultPool(const char* needle) bool jconf::parse_file(const char* sFilename, bool main_conf) { - FILE * pFile; - char * buffer; + FILE* pFile; + char* buffer; size_t flen; pFile = fopen(sFilename, "rb"); - if (pFile == NULL) + if(pFile == NULL) { printer::inst()->print_msg(L0, "Failed to open config file %s.", sFilename); return false; } - fseek(pFile,0,SEEK_END); + fseek(pFile, 0, SEEK_END); flen = ftell(pFile); rewind(pFile); - if(flen >= 64*1024) + if(flen >= 64 * 1024) { fclose(pFile); printer::inst()->print_msg(L0, "Oversized config file - %s.", sFilename); @@ -396,7 +409,7 @@ bool jconf::parse_file(const char* sFilename, bool main_conf) } buffer = (char*)malloc(flen + 3); - if(fread(buffer+1, flen, 1, pFile) != 1) + if(fread(buffer + 1, flen, 1, pFile) != 1) { free(buffer); fclose(pFile); @@ -420,7 +433,7 @@ bool jconf::parse_file(const char* sFilename, bool main_conf) Document& root = main_conf ? prv->jsonDoc : prv->jsonDocPools; - root.Parse(buffer, flen+2); + root.Parse(buffer, flen + 2); free(buffer); if(root.HasParseError()) @@ -514,11 +527,11 @@ bool jconf::parse_config(const char* sFilename, const char* sFilenamePools) std::vector pool_weights; pool_weights.reserve(pool_cnt); - const char* aPoolValues[] = { "pool_address", "wallet_address", "rig_id", "pool_password", "use_nicehash", "use_tls", "tls_fingerprint", "pool_weight" }; - Type poolValTypes[] = { kStringType, kStringType, kStringType, kStringType, kTrueType, kTrueType, kStringType, kNumberType }; + const char* aPoolValues[] = {"pool_address", "wallet_address", "rig_id", "pool_password", "use_nicehash", "use_tls", "tls_fingerprint", "pool_weight"}; + Type poolValTypes[] = {kStringType, kStringType, kStringType, kStringType, kTrueType, kTrueType, kStringType, kNumberType}; - constexpr size_t pvcnt = sizeof(aPoolValues)/sizeof(aPoolValues[0]); - for(uint32_t i=0; i < pool_cnt; i++) + constexpr size_t pvcnt = sizeof(aPoolValues) / sizeof(aPoolValues[0]); + for(uint32_t i = 0; i < pool_cnt; i++) { const Value& oThdConf = prv->configValues[aPoolList]->GetArray()[i]; @@ -528,7 +541,7 @@ bool jconf::parse_config(const char* sFilename, const char* sFilenamePools) return false; } - for(uint32_t j=0; j < pvcnt; j++) + for(uint32_t j = 0; j < pvcnt; j++) { const Value* v; if((v = GetObjectMember(oThdConf, aPoolValues[j])) == nullptr) @@ -620,7 +633,7 @@ bool jconf::parse_config(const char* sFilename, const char* sFilenamePools) return false; } - for(size_t i=0; i < coin_algo_size; i++) + for(size_t i = 0; i < coin_algo_size; i++) { if(ctmp == coins[i].coin_name) { diff --git a/xmrstak/jconf.hpp b/xmrstak/jconf.hpp index 102b70f54..5597bf23e 100644 --- a/xmrstak/jconf.hpp +++ b/xmrstak/jconf.hpp @@ -1,15 +1,15 @@ #pragma once -#include "xmrstak/misc/environment.hpp" -#include "xmrstak/misc/coinDescription.hpp" #include "params.hpp" +#include "xmrstak/misc/coinDescription.hpp" +#include "xmrstak/misc/environment.hpp" #include #include class jconf { -public: + public: static jconf* inst() { auto& env = xmrstak::environment::inst(); @@ -20,7 +20,8 @@ class jconf bool parse_config(const char* sFilename, const char* sFilenamePools); - struct pool_cfg { + struct pool_cfg + { const char* sPoolAddr; const char* sWalletAddr; const char* sRigId; @@ -38,7 +39,8 @@ class jconf uint64_t GetPoolCount(); bool GetPoolConfig(size_t id, pool_cfg& cfg); - enum slow_mem_cfg { + enum slow_mem_cfg + { always_use, no_mlck, print_warning, @@ -80,7 +82,7 @@ class jconf slow_mem_cfg GetSlowMemSetting(); -private: + private: jconf(); bool parse_file(const char* sFilename, bool main_conf); diff --git a/xmrstak/misc/coinDescription.hpp b/xmrstak/misc/coinDescription.hpp index 65dee143c..b3b119226 100644 --- a/xmrstak/misc/coinDescription.hpp +++ b/xmrstak/misc/coinDescription.hpp @@ -2,86 +2,88 @@ #include "xmrstak/backend/cryptonight.hpp" +#include #include #include #include -#include namespace xmrstak { - struct coinDescription - { - xmrstak_algo algo = {xmrstak_algo_id::invalid_algo}; - uint8_t fork_version = 0u; - xmrstak_algo algo_root = {xmrstak_algo_id::invalid_algo}; +struct coinDescription +{ + xmrstak_algo algo = {xmrstak_algo_id::invalid_algo}; + uint8_t fork_version = 0u; + xmrstak_algo algo_root = {xmrstak_algo_id::invalid_algo}; - coinDescription() = default; + coinDescription() = default; - coinDescription( - const xmrstak_algo in_algo, - const uint8_t in_fork_version = 0, - xmrstak_algo in_algo_root = xmrstak_algo_id::invalid_algo - ) : - algo(in_algo), algo_root(in_algo_root), fork_version(in_fork_version) - {} + coinDescription( + const xmrstak_algo in_algo, + const uint8_t in_fork_version = 0, + xmrstak_algo in_algo_root = xmrstak_algo_id::invalid_algo) : + algo(in_algo), + algo_root(in_algo_root), + fork_version(in_fork_version) + { + } - inline xmrstak_algo GetMiningAlgo() const { return algo; } - inline xmrstak_algo GetMiningAlgoRoot() const { return algo_root; } - inline uint8_t GetMiningForkVersion() const { return fork_version; } - }; + inline xmrstak_algo GetMiningAlgo() const { return algo; } + inline xmrstak_algo GetMiningAlgoRoot() const { return algo_root; } + inline uint8_t GetMiningForkVersion() const { return fork_version; } +}; - struct coin_selection - { - const char* coin_name = nullptr; - /* [0] -> user pool +struct coin_selection +{ + const char* coin_name = nullptr; + /* [0] -> user pool * [1] -> dev pool */ - coinDescription pool_coin[2]; - const char* default_pool = nullptr; + coinDescription pool_coin[2]; + const char* default_pool = nullptr; - coin_selection() = default; + coin_selection() = default; - coin_selection( - const char* in_coin_name, - const coinDescription user_coinDescription, - const coinDescription dev_coinDescription, - const char* in_default_pool - ) : - coin_name(in_coin_name), default_pool(in_default_pool) - { - pool_coin[0] = user_coinDescription; - pool_coin[1] = dev_coinDescription; - } + coin_selection( + const char* in_coin_name, + const coinDescription user_coinDescription, + const coinDescription dev_coinDescription, + const char* in_default_pool) : + coin_name(in_coin_name), + default_pool(in_default_pool) + { + pool_coin[0] = user_coinDescription; + pool_coin[1] = dev_coinDescription; + } - /** get coin description for the pool + /** get coin description for the pool * * @param poolId 0 select dev pool, else the user pool is selected */ - inline coinDescription GetDescription(size_t poolId) const { - coinDescription tmp = (poolId == 0 ? pool_coin[1] : pool_coin[0]); - return tmp; - } + inline coinDescription GetDescription(size_t poolId) const + { + coinDescription tmp = (poolId == 0 ? pool_coin[1] : pool_coin[0]); + return tmp; + } - /** return all POW algorithm for the current selected currency + /** return all POW algorithm for the current selected currency * * @return required POW algorithms without duplicated entries */ - inline std::vector GetAllAlgorithms() - { - std::vector allAlgos = { - GetDescription(0).GetMiningAlgo(), - GetDescription(0).GetMiningAlgoRoot(), - GetDescription(1).GetMiningAlgo(), - GetDescription(1).GetMiningAlgoRoot() - }; + inline std::vector GetAllAlgorithms() + { + std::vector allAlgos = { + GetDescription(0).GetMiningAlgo(), + GetDescription(0).GetMiningAlgoRoot(), + GetDescription(1).GetMiningAlgo(), + GetDescription(1).GetMiningAlgoRoot()}; - std::sort(allAlgos.begin(), allAlgos.end()); - std::remove(allAlgos.begin(), allAlgos.end(), invalid_algo); - auto last = std::unique(allAlgos.begin(), allAlgos.end()); - // remove duplicated algorithms - allAlgos.erase(last, allAlgos.end()); + std::sort(allAlgos.begin(), allAlgos.end()); + std::remove(allAlgos.begin(), allAlgos.end(), invalid_algo); + auto last = std::unique(allAlgos.begin(), allAlgos.end()); + // remove duplicated algorithms + allAlgos.erase(last, allAlgos.end()); - return allAlgos; - } - }; + return allAlgos; + } +}; } // namespace xmrstak diff --git a/xmrstak/misc/configEditor.hpp b/xmrstak/misc/configEditor.hpp index 3f79df44c..ae81f62c5 100644 --- a/xmrstak/misc/configEditor.hpp +++ b/xmrstak/misc/configEditor.hpp @@ -1,10 +1,10 @@ #pragma once #include -#include #include -#include #include +#include +#include #include "../version.hpp" @@ -17,16 +17,15 @@ struct configEditor configEditor() { - } - static bool file_exist( const std::string filename) + static bool file_exist(const std::string filename) { std::ifstream fstream(filename); return fstream.good(); } - void set( const std::string && content) + void set(const std::string&& content) { m_fileContent = content; } @@ -36,8 +35,7 @@ struct configEditor std::ifstream fstream(filename); m_fileContent = std::string( (std::istreambuf_iterator(fstream)), - std::istreambuf_iterator() - ); + std::istreambuf_iterator()); return fstream.good(); } @@ -70,7 +68,6 @@ struct configEditor { m_fileContent = std::regex_replace(m_fileContent, std::regex(search), substring); } - }; } // namespace xmrstak diff --git a/xmrstak/misc/console.cpp b/xmrstak/misc/console.cpp index c39237eab..529cc9453 100644 --- a/xmrstak/misc/console.cpp +++ b/xmrstak/misc/console.cpp @@ -23,11 +23,11 @@ #include "xmrstak/misc/console.hpp" -#include +#include +#include #include #include -#include -#include +#include #ifdef _WIN32 #include @@ -37,15 +37,15 @@ int get_key() DWORD mode, rd; HANDLE h; - if ((h = GetStdHandle(STD_INPUT_HANDLE)) == NULL) + if((h = GetStdHandle(STD_INPUT_HANDLE)) == NULL) return -1; - GetConsoleMode( h, &mode ); - SetConsoleMode( h, mode & ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT) ); + GetConsoleMode(h, &mode); + SetConsoleMode(h, mode & ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT)); int c = 0; - ReadConsole( h, &c, 1, &rd, NULL ); - SetConsoleMode( h, mode ); + ReadConsole(h, &c, 1, &rd, NULL); + SetConsoleMode(h, mode); return c; } @@ -90,20 +90,20 @@ void reset_colour() } #else +#include #include #include -#include int get_key() { struct termios oldattr, newattr; int ch; - tcgetattr( STDIN_FILENO, &oldattr ); + tcgetattr(STDIN_FILENO, &oldattr); newattr = oldattr; - newattr.c_lflag &= ~( ICANON | ECHO ); - tcsetattr( STDIN_FILENO, TCSANOW, &newattr ); + newattr.c_lflag &= ~(ICANON | ECHO); + tcsetattr(STDIN_FILENO, TCSANOW, &newattr); ch = getchar(); - tcsetattr( STDIN_FILENO, TCSANOW, &oldattr ); + tcsetattr(STDIN_FILENO, TCSANOW, &oldattr); return ch; } @@ -182,17 +182,17 @@ void printer::print_msg(verbosity verbose, const char* fmt, ...) va_list args; va_start(args, fmt); - vsnprintf(buf+bpos, sizeof(buf)-bpos, fmt, args); + vsnprintf(buf + bpos, sizeof(buf) - bpos, fmt, args); va_end(args); bpos = strlen(buf); - if(bpos+2 >= sizeof(buf)) + if(bpos + 2 >= sizeof(buf)) return; buf[bpos] = '\n'; - buf[bpos+1] = '\0'; + buf[bpos + 1] = '\0'; - print_str(buf); + print_str(buf); } void printer::print_str(const char* str) diff --git a/xmrstak/misc/console.hpp b/xmrstak/misc/console.hpp index 6df6597c6..3c27ee86b 100644 --- a/xmrstak/misc/console.hpp +++ b/xmrstak/misc/console.hpp @@ -4,8 +4,17 @@ #include - -enum out_colours { K_RED, K_GREEN, K_BLUE, K_YELLOW, K_CYAN, K_MAGENTA, K_WHITE, K_NONE }; +enum out_colours +{ + K_RED, + K_GREEN, + K_BLUE, + K_YELLOW, + K_CYAN, + K_MAGENTA, + K_WHITE, + K_NONE +}; // Warning - on Linux get_key will detect control keys, but not on Windows. // We will only use it for alphanum keys anyway. @@ -21,11 +30,20 @@ inline long long unsigned int int_port(size_t i) return i; } -enum verbosity : size_t { L0 = 0, L1 = 1, L2 = 2, L3 = 3, L4 = 4, LDEBUG = 10, LINF = 100}; +enum verbosity : size_t +{ + L0 = 0, + L1 = 1, + L2 = 2, + L3 = 3, + L4 = 4, + LDEBUG = 10, + LINF = 100 +}; class printer { -public: + public: static inline printer* inst() { auto& env = xmrstak::environment::inst(); @@ -39,7 +57,7 @@ class printer void print_str(const char* str); bool open_logfile(const char* file); -private: + private: printer(); std::mutex print_mutex; diff --git a/xmrstak/misc/executor.cpp b/xmrstak/misc/executor.cpp index 83c92e058..cbc817cd9 100644 --- a/xmrstak/misc/executor.cpp +++ b/xmrstak/misc/executor.cpp @@ -21,31 +21,30 @@ * */ -#include "xmrstak/jconf.hpp" #include "executor.hpp" +#include "xmrstak/jconf.hpp" #include "xmrstak/net/jpsock.hpp" #include "telemetry.hpp" -#include "xmrstak/backend/miner_work.hpp" -#include "xmrstak/backend/globalStates.hpp" #include "xmrstak/backend/backendConnector.hpp" +#include "xmrstak/backend/globalStates.hpp" #include "xmrstak/backend/iBackend.hpp" +#include "xmrstak/backend/miner_work.hpp" +#include "xmrstak/donate-level.hpp" +#include "xmrstak/http/webdesign.hpp" #include "xmrstak/jconf.hpp" #include "xmrstak/misc/console.hpp" -#include "xmrstak/donate-level.hpp" #include "xmrstak/version.hpp" -#include "xmrstak/http/webdesign.hpp" -#include -#include -#include #include -#include #include +#include +#include +#include +#include #include - #ifdef _WIN32 #define strncasecmp _strnicmp #endif // _WIN32 @@ -63,7 +62,7 @@ void executor::push_timed_event(ex_event&& ev, size_t sec) void executor::ex_clock_thd() { size_t tick = 0; - while (true) + while(true) { std::this_thread::sleep_for(std::chrono::milliseconds(size_t(iTickTime))); @@ -76,7 +75,7 @@ void executor::ex_clock_thd() // Service timed events std::unique_lock lck(timed_event_mutex); std::list::iterator ev = lTimedEvents.begin(); - while (ev != lTimedEvents.end()) + while(ev != lTimedEvents.end()) { ev->ticks_left--; if(ev->ticks_left == 0) @@ -96,7 +95,8 @@ bool executor::get_live_pools(std::vector& eval_pools, bool is_dev) size_t limit = jconf::inst()->GetGiveUpLimit(); size_t wait = jconf::inst()->GetNetRetry(); - if(limit == 0 || is_dev) limit = (-1); //No limit = limit of 2^64-1 + if(limit == 0 || is_dev) + limit = (-1); //No limit = limit of 2^64-1 size_t pool_count = 0; size_t over_limit = 0; @@ -329,7 +329,7 @@ void executor::on_sock_ready(size_t pool_id) { if(pool->have_call_error() && !pool->is_dev_pool()) { - std::string str = "Login error: " + pool->get_call_error(); + std::string str = "Login error: " + pool->get_call_error(); log_socket_error(pool, std::move(str)); } @@ -368,7 +368,8 @@ void executor::on_pool_have_job(size_t pool_id, pool_job& oPoolJob) dat.pool_id = pool_id; xmrstak::globalStates::inst().switch_work(xmrstak::miner_work(oPoolJob.sJobID, oPoolJob.bWorkBlob, - oPoolJob.iWorkLen, oPoolJob.iTarget, pool->is_nicehash(), pool_id, oPoolJob.iBlockHeight), dat); + oPoolJob.iWorkLen, oPoolJob.iTarget, pool->is_nicehash(), pool_id, oPoolJob.iBlockHeight), + dat); if(dat.pool_id != pool_id) { @@ -419,12 +420,11 @@ void executor::on_miner_result(size_t pool_id, job_result& oResult) //Ignore errors silently if(pool->is_running() && pool->is_logged_in()) pool->cmd_submit(oResult.sJobID, oResult.iNonce, oResult.bResult, backend_name, - backend_hashcount, total_hashcount, oResult.algorithm - ); + backend_hashcount, total_hashcount, oResult.algorithm); return; } - if (!pool->is_running() || !pool->is_logged_in()) + if(!pool->is_running() || !pool->is_logged_in()) { log_result_error("[NETWORK ERROR]"); return; @@ -432,8 +432,7 @@ void executor::on_miner_result(size_t pool_id, job_result& oResult) size_t t_start = get_timestamp_ms(); bool bResult = pool->cmd_submit(oResult.sJobID, oResult.iNonce, oResult.bResult, - backend_name, backend_hashcount, total_hashcount, oResult.algorithm - ); + backend_name, backend_hashcount, total_hashcount, oResult.algorithm); size_t t_len = get_timestamp_ms() - t_start; if(t_len > 0xFFFF) @@ -476,12 +475,14 @@ void disable_sigpipe() memset(&sa, 0, sizeof(sa)); sa.sa_handler = SIG_IGN; sa.sa_flags = 0; - if (sigaction(SIGPIPE, &sa, 0) == -1) + if(sigaction(SIGPIPE, &sa, 0) == -1) printer::inst()->print_msg(L1, "ERROR: Call to sigaction failed!"); } #else -inline void disable_sigpipe() {} +inline void disable_sigpipe() +{ +} #endif void executor::ex_main() @@ -495,7 +496,7 @@ void executor::ex_main() // \todo collect all backend threads pvThreads = xmrstak::BackendConnector::thread_starter(oWork); - if(pvThreads->size()==0) + if(pvThreads->size() == 0) { printer::inst()->print_msg(L1, "ERROR: No miner backend enabled."); win_exit(); @@ -507,11 +508,11 @@ void executor::ex_main() size_t pc = jconf::inst()->GetPoolCount(); bool dev_tls = true; bool already_have_cli_pool = false; - size_t i=0; + size_t i = 0; for(; i < pc; i++) { jconf::pool_cfg cfg; - jconf::inst()->GetPoolConfig(i, cfg); + jconf::inst()->GetPoolConfig(i, cfg); #ifdef CONF_NO_TLS if(cfg.tls) { @@ -519,7 +520,8 @@ void executor::ex_main() win_exit(); } #endif - if(!cfg.tls) dev_tls = false; + if(!cfg.tls) + dev_tls = false; if(!xmrstak::params::inst().poolURL.empty() && xmrstak::params::inst().poolURL == cfg.sPoolAddr) { @@ -531,10 +533,10 @@ void executor::ex_main() const char* pwd = params.userSetPwd ? params.poolPasswd.c_str() : cfg.sPasswd; bool nicehash = cfg.nicehash || params.nicehashMode; - pools.emplace_back(i+1, cfg.sPoolAddr, wallet, rigid, pwd, 9.9, false, params.poolUseTls, cfg.tls_fingerprint, nicehash); + pools.emplace_back(i + 1, cfg.sPoolAddr, wallet, rigid, pwd, 9.9, false, params.poolUseTls, cfg.tls_fingerprint, nicehash); } else - pools.emplace_back(i+1, cfg.sPoolAddr, cfg.sWalletAddr, cfg.sRigId, cfg.sPasswd, cfg.weight, false, cfg.tls, cfg.tls_fingerprint, cfg.nicehash); + pools.emplace_back(i + 1, cfg.sPoolAddr, cfg.sWalletAddr, cfg.sRigId, cfg.sPasswd, cfg.weight, false, cfg.tls, cfg.tls_fingerprint, cfg.nicehash); } if(!xmrstak::params::inst().poolURL.empty() && !already_have_cli_pool) @@ -546,7 +548,7 @@ void executor::ex_main() win_exit(); } - pools.emplace_back(i+1, params.poolURL.c_str(), params.poolUsername.c_str(), params.poolRigid.c_str(), params.poolPasswd.c_str(), 9.9, false, params.poolUseTls, "", params.nicehashMode); + pools.emplace_back(i + 1, params.poolURL.c_str(), params.poolUsername.c_str(), params.poolRigid.c_str(), params.poolPasswd.c_str(), 9.9, false, params.poolUseTls, "", params.nicehashMode); } switch(jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo()) @@ -598,10 +600,10 @@ void executor::ex_main() push_timed_event(ex_event(EV_HASHRATE_LOOP), jconf::inst()->GetAutohashTime()); size_t cnt = 0; - while (true) + while(true) { ev = oEventQ.pop(); - switch (ev.iName) + switch(ev.iName) { case EV_SOCK_READY: on_sock_ready(ev.iPoolId); @@ -632,9 +634,9 @@ void executor::ex_main() } case EV_PERF_TICK: - for (i = 0; i < pvThreads->size(); i++) + for(i = 0; i < pvThreads->size(); i++) telem->push_perf_value(i, pvThreads->at(i)->iHashCount.load(std::memory_order_relaxed), - pvThreads->at(i)->iTimestamp.load(std::memory_order_relaxed)); + pvThreads->at(i)->iTimestamp.load(std::memory_order_relaxed)); if((cnt++ & 0xF) == 0) //Every 16 ticks { @@ -642,7 +644,7 @@ void executor::ex_main() double fTelem; bool normal = true; - for (i = 0; i < pvThreads->size(); i++) + for(i = 0; i < pvThreads->size(); i++) { fTelem = telem->calc_telemetry_data(10000, i); if(std::isnormal(fTelem)) @@ -703,7 +705,7 @@ bool executor::motd_filter_console(std::string& motd) if(motd.size() > motd_max_length) return false; - motd.erase(std::remove_if(motd.begin(), motd.end(), [](int chr)->bool { return !((chr >= 0x20 && chr <= 0x7e) || chr == '\n');}), motd.end()); + motd.erase(std::remove_if(motd.begin(), motd.end(), [](int chr) -> bool { return !((chr >= 0x20 && chr <= 0x7e) || chr == '\n'); }), motd.end()); return motd.size() > 0; } @@ -715,7 +717,7 @@ bool executor::motd_filter_web(std::string& motd) std::string tmp; tmp.reserve(motd.size() + 128); - for(size_t i=0; i < motd.size(); i++) + for(size_t i = 0; i < motd.size(); i++) { char c = motd[i]; switch(c) @@ -768,17 +770,15 @@ void executor::hashrate_report(std::string& out) } char num[32]; - double fTotal[3] = { 0.0, 0.0, 0.0}; + double fTotal[3] = {0.0, 0.0, 0.0}; - for( uint32_t b = 0; b < 4u; ++b) + for(uint32_t b = 0; b < 4u; ++b) { std::vector backEnds; std::copy_if(pvThreads->begin(), pvThreads->end(), std::back_inserter(backEnds), - [&](xmrstak::iBackend* backend) - { + [&](xmrstak::iBackend* backend) { return backend->backendType == b; - } - ); + }); size_t nthd = backEnds.size(); if(nthd != 0) @@ -795,8 +795,8 @@ void executor::hashrate_report(std::string& out) else out.append(1, '\n'); - double fTotalCur[3] = { 0.0, 0.0, 0.0}; - for (i = 0; i < nthd; i++) + double fTotalCur[3] = {0.0, 0.0, 0.0}; + for(i = 0; i < nthd; i++) { double fHps[3]; @@ -877,12 +877,11 @@ void executor::result_report(std::string& out) size_t iGoodRes = vMineResults[0].count, iTotalRes = iGoodRes; size_t ln = vMineResults.size(); - for(size_t i=1; i < ln; i++) + for(size_t i = 1; i < ln; i++) iTotalRes += vMineResults[i].count; out.append("RESULT REPORT\n"); - out.append("Currency : "). - append(jconf::inst()->GetMiningCoin()).append("\n"); + out.append("Currency : ").append(jconf::inst()->GetMiningCoin()).append("\n"); if(iTotalRes == 0) { out.append("You haven't found any results yet.\n"); @@ -898,8 +897,7 @@ void executor::result_report(std::string& out) snprintf(num, sizeof(num), " (%.1f %%)\n", 100.0 * iGoodRes / iTotalRes); out.append("Difficulty : ").append(std::to_string(iPoolDiff)).append(1, '\n'); - out.append("Good results : ").append(std::to_string(iGoodRes)).append(" / "). - append(std::to_string(iTotalRes)).append(num); + out.append("Good results : ").append(std::to_string(iGoodRes)).append(" / ").append(std::to_string(iTotalRes)).append(num); if(iPoolCallTimes.size() != 0) { @@ -910,10 +908,10 @@ void executor::result_report(std::string& out) out.append("Pool-side hashes : ").append(std::to_string(iPoolHashes)).append(2, '\n'); out.append("Top 10 best results found:\n"); - for(size_t i=0; i < 10; i += 2) + for(size_t i = 0; i < 10; i += 2) { snprintf(num, sizeof(num), "| %2llu | %16llu | %2llu | %16llu |\n", - int_port(i), int_port(iTopDiff[i]), int_port(i+1), int_port(iTopDiff[i+1])); + int_port(i), int_port(iTopDiff[i]), int_port(i + 1), int_port(iTopDiff[i + 1])); out.append(num); } @@ -921,7 +919,7 @@ void executor::result_report(std::string& out) if(ln > 1) { out.append("| Count | Error text | Last seen |\n"); - for(size_t i=1; i < ln; i++) + for(size_t i = 1; i < ln; i++) { snprintf(num, sizeof(num), "| %5llu | %-32.32s | %s |\n", int_port(vMineResults[i].count), vMineResults[i].msg.c_str(), time_format(date, sizeof(date), vMineResults[i].time)); @@ -952,11 +950,11 @@ void executor::connection_report(std::string& out) out.append("Connected since : \n"); size_t n_calls = iPoolCallTimes.size(); - if (n_calls > 1) + if(n_calls > 1) { //Not-really-but-good-enough median - std::nth_element(iPoolCallTimes.begin(), iPoolCallTimes.begin() + n_calls/2, iPoolCallTimes.end()); - out.append("Pool ping time : ").append(std::to_string(iPoolCallTimes[n_calls/2])).append(" ms\n"); + std::nth_element(iPoolCallTimes.begin(), iPoolCallTimes.begin() + n_calls / 2, iPoolCallTimes.end()); + out.append("Pool ping time : ").append(std::to_string(iPoolCallTimes[n_calls / 2])).append(" ms\n"); } else out.append("Pool ping time : (n/a)\n"); @@ -966,7 +964,7 @@ void executor::connection_report(std::string& out) if(ln > 0) { out.append("| Date | Error text |\n"); - for(size_t i=0; i < ln; i++) + for(size_t i = 0; i < ln; i++) { snprintf(num, sizeof(num), "| %s | %-54.54s |\n", time_format(date, sizeof(date), vSocketLog[i].time), vSocketLog[i].msg.c_str()); @@ -1039,11 +1037,11 @@ void executor::http_hashrate_report(std::string& out) snprintf(buffer, sizeof(buffer), sHtmlHashrateBodyHigh, (unsigned int)nthd + 3); out.append(buffer); - double fTotal[3] = { 0.0, 0.0, 0.0}; + double fTotal[3] = {0.0, 0.0, 0.0}; auto bTypePrev = static_cast(0); std::string name; size_t j = 0; - for(size_t i=0; i < nthd; i++) + for(size_t i = 0; i < nthd; i++) { double fHps[3]; char csThreadTag[25]; @@ -1059,14 +1057,13 @@ void executor::http_hashrate_report(std::string& out) } snprintf(csThreadTag, sizeof(csThreadTag), (99 < nthd) ? "[%s.%03u]:%03u" : ((9 < nthd) ? "[%s.%02u]:%02u" : "[%s.%u]:%u"), - name.c_str(), (unsigned int)(j), (unsigned int)i - ); + name.c_str(), (unsigned int)(j), (unsigned int)i); fHps[0] = telem->calc_telemetry_data(10000, i); fHps[1] = telem->calc_telemetry_data(60000, i); fHps[2] = telem->calc_telemetry_data(900000, i); - num_a[0] = num_b[0] = num_c[0] ='\0'; + num_a[0] = num_b[0] = num_c[0] = '\0'; hps_format(fHps[0], num_a, sizeof(num_a)); hps_format(fHps[1], num_b, sizeof(num_b)); hps_format(fHps[2], num_c, sizeof(num_c)); @@ -1079,7 +1076,7 @@ void executor::http_hashrate_report(std::string& out) out.append(buffer); } - num_a[0] = num_b[0] = num_c[0] = num_d[0] ='\0'; + num_a[0] = num_b[0] = num_c[0] = num_d[0] = '\0'; hps_format(fTotal[0], num_a, sizeof(num_a)); hps_format(fTotal[1], num_b, sizeof(num_b)); hps_format(fTotal[2], num_c, sizeof(num_c)); @@ -1096,13 +1093,13 @@ void executor::http_result_report(std::string& out) out.reserve(4096); - snprintf(buffer, sizeof(buffer), sHtmlCommonHeader, "Result Report", ver_html, "Result Report"); + snprintf(buffer, sizeof(buffer), sHtmlCommonHeader, "Result Report", ver_html, "Result Report"); out.append(buffer); size_t iGoodRes = vMineResults[0].count, iTotalRes = iGoodRes; size_t ln = vMineResults.size(); - for(size_t i=1; i < ln; i++) + for(size_t i = 1; i < ln; i++) iTotalRes += vMineResults[i].count; double fGoodResPrc = 0.0; @@ -1113,8 +1110,7 @@ void executor::http_result_report(std::string& out) if(iPoolCallTimes.size() > 0) { using namespace std::chrono; - fAvgResTime = ((double)duration_cast(system_clock::now() - tPoolConnTime).count()) - / iPoolCallTimes.size(); + fAvgResTime = ((double)duration_cast(system_clock::now() - tPoolConnTime).count()) / iPoolCallTimes.size(); } snprintf(buffer, sizeof(buffer), sHtmlResultBodyHigh, @@ -1126,7 +1122,7 @@ void executor::http_result_report(std::string& out) out.append(buffer); - for(size_t i=1; i < vMineResults.size(); i++) + for(size_t i = 1; i < vMineResults.size(); i++) { snprintf(buffer, sizeof(buffer), sHtmlResultTableRow, vMineResults[i].msg.c_str(), int_port(vMineResults[i].count), time_format(date, sizeof(date), vMineResults[i].time)); @@ -1143,7 +1139,7 @@ void executor::http_connection_report(std::string& out) out.reserve(4096); - snprintf(buffer, sizeof(buffer), sHtmlCommonHeader, "Connection Report", ver_html, "Connection Report"); + snprintf(buffer, sizeof(buffer), sHtmlCommonHeader, "Connection Report", ver_html, "Connection Report"); out.append(buffer); jpsock* pool = pick_pool_by_id(current_pool_id); @@ -1151,16 +1147,16 @@ void executor::http_connection_report(std::string& out) pool = pick_pool_by_id(last_usr_pool_id); const char* cdate = "not connected"; - if (pool != nullptr && pool->is_running() && pool->is_logged_in()) + if(pool != nullptr && pool->is_running() && pool->is_logged_in()) cdate = time_format(date, sizeof(date), tPoolConnTime); size_t n_calls = iPoolCallTimes.size(); unsigned int ping_time = 0; - if (n_calls > 1) + if(n_calls > 1) { //Not-really-but-good-enough median - std::nth_element(iPoolCallTimes.begin(), iPoolCallTimes.begin() + n_calls/2, iPoolCallTimes.end()); - ping_time = iPoolCallTimes[n_calls/2]; + std::nth_element(iPoolCallTimes.begin(), iPoolCallTimes.begin() + n_calls / 2, iPoolCallTimes.end()); + ping_time = iPoolCallTimes[n_calls / 2]; } snprintf(buffer, sizeof(buffer), sHtmlConnectionBodyHigh, @@ -1169,8 +1165,7 @@ void executor::http_connection_report(std::string& out) cdate, ping_time); out.append(buffer); - - for(size_t i=0; i < vSocketLog.size(); i++) + for(size_t i = 0; i < vSocketLog.size(); i++) { snprintf(buffer, sizeof(buffer), sHtmlConnectionTableRow, time_format(date, sizeof(date), vSocketLog[i].time), vSocketLog[i].msg.c_str()); @@ -1199,12 +1194,13 @@ void executor::http_json_report(std::string& out) std::string hr_thds, res_error, cn_error; size_t nthd = pvThreads->size(); - double fTotal[3] = { 0.0, 0.0, 0.0}; + double fTotal[3] = {0.0, 0.0, 0.0}; hr_thds.reserve(nthd * 32); - for(size_t i=0; i < nthd; i++) + for(size_t i = 0; i < nthd; i++) { - if(i != 0) hr_thds.append(1, ','); + if(i != 0) + hr_thds.append(1, ','); double fHps[3]; fHps[0] = telem->calc_telemetry_data(10000, i); @@ -1232,7 +1228,7 @@ void executor::http_json_report(std::string& out) size_t iGoodRes = vMineResults[0].count, iTotalRes = iGoodRes; size_t ln = vMineResults.size(); - for(size_t i=1; i < ln; i++) + for(size_t i = 1; i < ln; i++) iTotalRes += vMineResults[i].count; jpsock* pool = pick_pool_by_id(current_pool_id); @@ -1252,10 +1248,11 @@ void executor::http_json_report(std::string& out) char buffer[2048]; res_error.reserve((vMineResults.size() - 1) * 128); - for(size_t i=1; i < vMineResults.size(); i++) + for(size_t i = 1; i < vMineResults.size(); i++) { using namespace std::chrono; - if(i != 1) res_error.append(1, ','); + if(i != 1) + res_error.append(1, ','); snprintf(buffer, sizeof(buffer), sJsonApiResultError, int_port(vMineResults[i].count), int_port(duration_cast(vMineResults[i].time.time_since_epoch()).count()), @@ -1265,18 +1262,19 @@ void executor::http_json_report(std::string& out) size_t n_calls = iPoolCallTimes.size(); size_t iPoolPing = 0; - if (n_calls > 1) + if(n_calls > 1) { //Not-really-but-good-enough median - std::nth_element(iPoolCallTimes.begin(), iPoolCallTimes.begin() + n_calls/2, iPoolCallTimes.end()); - iPoolPing = iPoolCallTimes[n_calls/2]; + std::nth_element(iPoolCallTimes.begin(), iPoolCallTimes.begin() + n_calls / 2, iPoolCallTimes.end()); + iPoolPing = iPoolCallTimes[n_calls / 2]; } cn_error.reserve(vSocketLog.size() * 256); - for(size_t i=0; i < vSocketLog.size(); i++) + for(size_t i = 0; i < vSocketLog.size(); i++) { using namespace std::chrono; - if(i != 0) cn_error.append(1, ','); + if(i != 0) + cn_error.append(1, ','); snprintf(buffer, sizeof(buffer), sJsonApiConnectionError, int_port(duration_cast(vSocketLog[i].time.time_since_epoch()).count()), @@ -1285,7 +1283,7 @@ void executor::http_json_report(std::string& out) } size_t bb_size = 2048 + hr_thds.size() + res_error.size() + cn_error.size(); - std::unique_ptr bigbuf( new char[ bb_size ] ); + std::unique_ptr bigbuf(new char[bb_size]); int bb_len = snprintf(bigbuf.get(), bb_size, sJsonApiFormat, get_version_str().c_str(), hr_thds.c_str(), hr_buffer, a, @@ -1332,8 +1330,7 @@ void executor::get_http_report(ex_event_name ev_id, std::string& data) std::lock_guard lck(httpMutex); assert(pHttpString == nullptr); - assert(ev_id == EV_HTML_HASHRATE || ev_id == EV_HTML_RESULTS - || ev_id == EV_HTML_CONNSTAT || ev_id == EV_HTML_JSON); + assert(ev_id == EV_HTML_HASHRATE || ev_id == EV_HTML_RESULTS || ev_id == EV_HTML_CONNSTAT || ev_id == EV_HTML_JSON); pHttpString = &data; httpReady = std::promise(); diff --git a/xmrstak/misc/executor.hpp b/xmrstak/misc/executor.hpp index be5ee6c2f..47359afc2 100644 --- a/xmrstak/misc/executor.hpp +++ b/xmrstak/misc/executor.hpp @@ -1,18 +1,18 @@ #pragma once -#include "thdq.hpp" #include "telemetry.hpp" +#include "thdq.hpp" #include "xmrstak/backend/iBackend.hpp" +#include "xmrstak/donate-level.hpp" #include "xmrstak/misc/environment.hpp" #include "xmrstak/net/msgstruct.hpp" -#include "xmrstak/donate-level.hpp" -#include #include +#include +#include +#include #include #include -#include -#include class jpsock; @@ -27,7 +27,7 @@ class minethd; class executor { -public: + public: static executor* inst() { auto& env = xmrstak::environment::inst(); @@ -43,13 +43,15 @@ class executor inline void push_event(ex_event&& ev) { oEventQ.push(std::move(ev)); } void push_timed_event(ex_event&& ev, size_t sec); -private: + private: struct timed_event { ex_event event; size_t ticks_left; - timed_event(ex_event&& ev, size_t ticks) : event(std::move(ev)), ticks_left(ticks) {} + timed_event(ex_event&& ev, size_t ticks) : + event(std::move(ev)), + ticks_left(ticks) {} }; inline void set_timestamp() { dev_timestamp = get_timestamp(); }; @@ -119,7 +121,8 @@ class executor std::chrono::system_clock::time_point time; std::string msg; - sck_error_log(std::string&& err) : msg(std::move(err)) + sck_error_log(std::string&& err) : + msg(std::move(err)) { time = std::chrono::system_clock::now(); } @@ -134,12 +137,16 @@ class executor std::string msg; size_t count; - result_tally() : msg("[OK]"), count(0) + result_tally() : + msg("[OK]"), + count(0) { time = std::chrono::system_clock::now(); } - result_tally(std::string&& err) : msg(std::move(err)), count(1) + result_tally(std::string&& err) : + msg(std::move(err)), + count(1) { time = std::chrono::system_clock::now(); } @@ -161,7 +168,7 @@ class executor std::vector vMineResults; //More result statistics - std::array iTopDiff { { } }; //Initialize to zero + std::array iTopDiff{{}}; //Initialize to zero std::chrono::system_clock::time_point tPoolConnTime; size_t iPoolHashes = 0; @@ -195,4 +202,3 @@ class executor inline size_t sec_to_ticks(size_t sec) { return sec * (1000 / iTickTime); } }; - diff --git a/xmrstak/misc/home_dir.hpp b/xmrstak/misc/home_dir.hpp new file mode 100644 index 000000000..836c7cc4e --- /dev/null +++ b/xmrstak/misc/home_dir.hpp @@ -0,0 +1,43 @@ +#pragma once + +#include + +#ifdef _WIN32 +#include +// this comment avoid that clang format reorders the includes +#include + +namespace +{ +inline std::string get_home() +{ + char path[MAX_PATH + 1]; + // get folder "appdata\local" + if(SHGetSpecialFolderPathA(HWND_DESKTOP, path, CSIDL_LOCAL_APPDATA, FALSE)) + { + return path; + } + else + return "."; +} +} // namespace + +#else +#include +#include +#include + +namespace +{ +inline std::string get_home() +{ + const char* home = "."; + + if((home = getenv("HOME")) == nullptr) + home = getpwuid(getuid())->pw_dir; + + return home; +} +} // namespace + +#endif // _WIN32 diff --git a/xmrstak/misc/jext.hpp b/xmrstak/misc/jext.hpp index 9936fa813..421508989 100644 --- a/xmrstak/misc/jext.hpp +++ b/xmrstak/misc/jext.hpp @@ -9,7 +9,7 @@ using namespace rapidjson; inline const Value* GetObjectMember(const Value& obj, const char* key) { Value::ConstMemberIterator itr = obj.FindMember(key); - if (itr != obj.MemberEnd()) + if(itr != obj.MemberEnd()) return &itr->value; else return nullptr; @@ -48,8 +48,8 @@ inline const Value* GetObjectMember(const Value& obj, const char* key) #elif defined(__NetBSD__) -#include #include +#include #if defined(__BSWAP_RENAME) && !defined(__bswap_32) #define bswap_32(x) bswap32(x) #define bswap_64(x) bswap64(x) diff --git a/xmrstak/misc/telemetry.cpp b/xmrstak/misc/telemetry.cpp index 47442df09..16ecaa6f6 100644 --- a/xmrstak/misc/telemetry.cpp +++ b/xmrstak/misc/telemetry.cpp @@ -24,9 +24,9 @@ #include "telemetry.hpp" #include "xmrstak/net/msgstruct.hpp" +#include #include #include -#include namespace xmrstak { @@ -38,7 +38,7 @@ telemetry::telemetry(size_t iThd) iBucketTop = new uint32_t[iThd]; mtx = new std::mutex[iThd]; - for (size_t i = 0; i < iThd; i++) + for(size_t i = 0; i < iThd; i++) { ppHashCounts[i] = new uint64_t[iBucketSize]; ppTimestamps[i] = new uint64_t[iBucketSize]; @@ -51,7 +51,6 @@ telemetry::telemetry(size_t iThd) double telemetry::calc_telemetry_data(size_t iLastMillisec, size_t iThread) { - uint64_t iEarliestHashCnt = 0; uint64_t iEarliestStamp = 0; uint64_t iLatestStamp = 0; @@ -62,20 +61,20 @@ double telemetry::calc_telemetry_data(size_t iLastMillisec, size_t iThread) uint64_t iTimeNow = get_timestamp_ms(); //Start at 1, buckettop points to next empty - for (size_t i = 1; i < iBucketSize; i++) + for(size_t i = 1; i < iBucketSize; i++) { size_t idx = (iBucketTop[iThread] - i) & iBucketMask; //overflow expected here - if (ppTimestamps[iThread][idx] == 0) + if(ppTimestamps[iThread][idx] == 0) break; //That means we don't have the data yet - if (iLatestStamp == 0) + if(iLatestStamp == 0) { iLatestStamp = ppTimestamps[iThread][idx]; iLatestHashCnt = ppHashCounts[iThread][idx]; } - if (iTimeNow - ppTimestamps[iThread][idx] > iLastMillisec) + if(iTimeNow - ppTimestamps[iThread][idx] > iLastMillisec) { bHaveFullSet = true; break; //We are out of the requested time period @@ -86,11 +85,11 @@ double telemetry::calc_telemetry_data(size_t iLastMillisec, size_t iThread) } lk.unlock(); - if (!bHaveFullSet || iEarliestStamp == 0 || iLatestStamp == 0) + if(!bHaveFullSet || iEarliestStamp == 0 || iLatestStamp == 0) return nan(""); //Don't think that can happen, but just in case - if (iLatestStamp - iEarliestStamp == 0) + if(iLatestStamp - iEarliestStamp == 0) return nan(""); double fHashes, fTime; diff --git a/xmrstak/misc/telemetry.hpp b/xmrstak/misc/telemetry.hpp index 580565de2..2ab2a9e5f 100644 --- a/xmrstak/misc/telemetry.hpp +++ b/xmrstak/misc/telemetry.hpp @@ -9,12 +9,12 @@ namespace xmrstak class telemetry { -public: + public: telemetry(size_t iThd); void push_perf_value(size_t iThd, uint64_t iHashCount, uint64_t iTimestamp); double calc_telemetry_data(size_t iLastMillisec, size_t iThread); -private: + private: std::mutex* mtx; constexpr static size_t iBucketSize = 2 << 11; //Power of 2 to simplify calculations constexpr static size_t iBucketMask = iBucketSize - 1; diff --git a/xmrstak/misc/thdq.hpp b/xmrstak/misc/thdq.hpp index 7a4a5cfe4..2eef30bcf 100644 --- a/xmrstak/misc/thdq.hpp +++ b/xmrstak/misc/thdq.hpp @@ -1,31 +1,37 @@ #pragma once - + +#include +#include #include #include -#include -#include - + template class thdq { -public: + public: T pop() { std::unique_lock mlock(mutex_); - while (queue_.empty()) { cond_.wait(mlock); } + while(queue_.empty()) + { + cond_.wait(mlock); + } auto item = std::move(queue_.front()); queue_.pop(); return item; } - + void pop(T& item) { std::unique_lock mlock(mutex_); - while (queue_.empty()) { cond_.wait(mlock); } + while(queue_.empty()) + { + cond_.wait(mlock); + } item = queue_.front(); queue_.pop(); } - + void push(const T& item) { std::unique_lock mlock(mutex_); @@ -33,7 +39,7 @@ class thdq mlock.unlock(); cond_.notify_one(); } - + void push(T&& item) { std::unique_lock mlock(mutex_); @@ -41,9 +47,9 @@ class thdq mlock.unlock(); cond_.notify_one(); } - -private: + + private: std::queue queue_; std::mutex mutex_; std::condition_variable cond_; -}; +}; diff --git a/xmrstak/misc/uac.cpp b/xmrstak/misc/uac.cpp index 9f338dde0..65d5db8f5 100644 --- a/xmrstak/misc/uac.cpp +++ b/xmrstak/misc/uac.cpp @@ -9,14 +9,14 @@ BOOL IsElevated() { BOOL fRet = FALSE; HANDLE hToken = NULL; - if (OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY, &hToken)) + if(OpenProcessToken(GetCurrentProcess(), TOKEN_QUERY, &hToken)) { TOKEN_ELEVATION Elevation; DWORD cbSize = sizeof(TOKEN_ELEVATION); - if (GetTokenInformation(hToken, TokenElevation, &Elevation, sizeof(Elevation), &cbSize)) + if(GetTokenInformation(hToken, TokenElevation, &Elevation, sizeof(Elevation), &cbSize)) fRet = Elevation.TokenIsElevated; } - if (hToken) + if(hToken) CloseHandle(hToken); return fRet; } @@ -24,10 +24,10 @@ BOOL IsElevated() BOOL SelfElevate(const std::string& my_path, const std::string& params) { using namespace xmrstak; - if (IsElevated()) + if(IsElevated()) return FALSE; - SHELLEXECUTEINFO shExecInfo = { 0 }; + SHELLEXECUTEINFO shExecInfo = {0}; shExecInfo.cbSize = sizeof(SHELLEXECUTEINFO); shExecInfo.fMask = SEE_MASK_NOCLOSEPROCESS; shExecInfo.hwnd = NULL; @@ -41,7 +41,7 @@ BOOL SelfElevate(const std::string& my_path, const std::string& params) shExecInfo.nShow = SW_HIDE; } - if (!ShellExecuteEx(&shExecInfo)) + if(!ShellExecuteEx(&shExecInfo)) return FALSE; // Loiter in the background to make scripting easier @@ -69,13 +69,13 @@ VOID RequestElevation() BOOL IsWindows10OrNewer() { - OSVERSIONINFOEX osvi = { 0 }; - osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX); - osvi.dwMajorVersion = 10; - osvi.dwMinorVersion = 0; - DWORDLONG dwlConditionMask = 0; - VER_SET_CONDITION(dwlConditionMask, VER_MAJORVERSION, VER_GREATER_EQUAL); - VER_SET_CONDITION(dwlConditionMask, VER_MINORVERSION, VER_GREATER_EQUAL); - return ::VerifyVersionInfo(&osvi, VER_MAJORVERSION | VER_MINORVERSION, dwlConditionMask); + OSVERSIONINFOEX osvi = {0}; + osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX); + osvi.dwMajorVersion = 10; + osvi.dwMinorVersion = 0; + DWORDLONG dwlConditionMask = 0; + VER_SET_CONDITION(dwlConditionMask, VER_MAJORVERSION, VER_GREATER_EQUAL); + VER_SET_CONDITION(dwlConditionMask, VER_MINORVERSION, VER_GREATER_EQUAL); + return ::VerifyVersionInfo(&osvi, VER_MAJORVERSION | VER_MINORVERSION, dwlConditionMask); } #endif diff --git a/xmrstak/misc/utility.cpp b/xmrstak/misc/utility.cpp index 5177d14c2..bf665bda3 100644 --- a/xmrstak/misc/utility.cpp +++ b/xmrstak/misc/utility.cpp @@ -1,21 +1,15 @@ -#include #include - +#include namespace xmrstak { - bool strcmp_i(const std::string& str1, const std::string& str2) - { - if(str1.size() != str2.size()) - return false; - else - return (str1.empty() | str2.empty()) ? - false : - std::equal(str1.begin(), str1.end(),str2.begin(), - [](char c1, char c2) - { - return ::tolower(c1) == ::tolower(c2); - } - ); - } +bool strcmp_i(const std::string& str1, const std::string& str2) +{ + if(str1.size() != str2.size()) + return false; + else + return (str1.empty() | str2.empty()) ? false : std::equal(str1.begin(), str1.end(), str2.begin(), [](char c1, char c2) { + return ::tolower(c1) == ::tolower(c2); + }); +} } // namespace xmrstak diff --git a/xmrstak/misc/utility.hpp b/xmrstak/misc/utility.hpp index 8f2e99fb8..0eb08993d 100644 --- a/xmrstak/misc/utility.hpp +++ b/xmrstak/misc/utility.hpp @@ -4,9 +4,9 @@ namespace xmrstak { - /** case insensitive string compare +/** case insensitive string compare * * @return true if both strings are equal, else false */ - bool strcmp_i(const std::string& str1, const std::string& str2); +bool strcmp_i(const std::string& str1, const std::string& str2); } // namespace xmrstak diff --git a/xmrstak/net/jpsock.cpp b/xmrstak/net/jpsock.cpp index 786b18b4f..f9522962f 100644 --- a/xmrstak/net/jpsock.cpp +++ b/xmrstak/net/jpsock.cpp @@ -21,17 +21,17 @@ * */ -#include -#include #include +#include #include +#include #include "jpsock.hpp" -#include "socks.hpp" #include "socket.hpp" +#include "socks.hpp" -#include "xmrstak/misc/executor.hpp" #include "xmrstak/jconf.hpp" +#include "xmrstak/misc/executor.hpp" #include "xmrstak/misc/jext.hpp" #include "xmrstak/version.hpp" @@ -45,7 +45,9 @@ struct jpsock::call_rsp std::string sCallErr; uint64_t iMessageId; - call_rsp(Value* val) : pCallData(val), iMessageId(0) + call_rsp(Value* val) : + pCallData(val), + iMessageId(0) { bHaveResponse = false; iCallId = 0; @@ -70,7 +72,7 @@ typedef GenericDocument, MemoryPoolAllocator<>, MemoryPoolAllocator<>> Me struct jpsock::opaque_private { - Value oCallValue; + Value oCallValue; MemoryPoolAllocator<> callAllocator; MemoryPoolAllocator<> recvAllocator; @@ -91,12 +93,24 @@ struct jpsock::opaque_private struct jpsock::opq_json_val { const Value* val; - opq_json_val(const Value* val) : val(val) {} + opq_json_val(const Value* val) : + val(val) {} }; jpsock::jpsock(size_t id, const char* sAddr, const char* sLogin, const char* sRigId, const char* sPassword, double pool_weight, bool dev_pool, bool tls, const char* tls_fp, bool nicehash) : - net_addr(sAddr), usr_login(sLogin), usr_rigid(sRigId), usr_pass(sPassword), tls_fp(tls_fp), pool_id(id), pool_weight(pool_weight), pool(dev_pool), nicehash(nicehash), - connect_time(0), connect_attempts(0), disconnect_time(0), quiet_close(false) + net_addr(sAddr), + usr_login(sLogin), + usr_rigid(sRigId), + usr_pass(sPassword), + tls_fp(tls_fp), + pool_id(id), + pool_weight(pool_weight), + pool(dev_pool), + nicehash(nicehash), + connect_time(0), + connect_attempts(0), + disconnect_time(0), + quiet_close(false) { sock_init(); @@ -245,7 +259,7 @@ bool jpsock::jpsock_thd_main() char buf[iSockBufferSize]; size_t datalen = 0; - while (true) + while(true) { int ret = sck->recv(buf + datalen, sizeof(buf) - datalen); @@ -254,7 +268,7 @@ bool jpsock::jpsock_thd_main() datalen += ret; - if (datalen >= sizeof(buf)) + if(datalen >= sizeof(buf)) { sck->close(false); return set_socket_error("RECEIVE error: data overflow"); @@ -262,12 +276,12 @@ bool jpsock::jpsock_thd_main() char* lnend; char* lnstart = buf; - while ((lnend = (char*)memchr(lnstart, '\n', datalen)) != nullptr) + while((lnend = (char*)memchr(lnstart, '\n', datalen)) != nullptr) { lnend++; int lnlen = lnend - lnstart; - if (!process_line(lnstart, lnlen)) + if(!process_line(lnstart, lnlen)) { sck->close(false); return false; @@ -278,7 +292,7 @@ bool jpsock::jpsock_thd_main() } //Got leftover data? Move it to the front - if (datalen > 0 && buf != lnstart) + if(datalen > 0 && buf != lnstart) memmove(buf, lnstart, datalen); } } @@ -291,18 +305,18 @@ bool jpsock::process_line(char* line, size_t len) ++iMessageCnt; /*NULL terminate the line instead of '\n', parsing will add some more NULLs*/ - line[len-1] = '\0'; + line[len - 1] = '\0'; //printf("RECV: %s\n", line); - if (prv->jsonDoc.ParseInsitu(line).HasParseError()) + if(prv->jsonDoc.ParseInsitu(line).HasParseError()) return set_socket_error("PARSE error: Invalid JSON"); - if (!prv->jsonDoc.IsObject()) + if(!prv->jsonDoc.IsObject()) return set_socket_error("PARSE error: Invalid root"); const Value* mt; - if (prv->jsonDoc.HasMember("method")) + if(prv->jsonDoc.HasMember("method")) { mt = GetObjectMember(prv->jsonDoc, "method"); @@ -329,7 +343,7 @@ bool jpsock::process_line(char* line, size_t len) { uint64_t iCallId; mt = GetObjectMember(prv->jsonDoc, "id"); - if (mt == nullptr || !mt->IsUint64()) + if(mt == nullptr || !mt->IsUint64()) return set_socket_error("PARSE error: Protocol error 3"); iCallId = mt->GetUint64(); @@ -338,10 +352,10 @@ bool jpsock::process_line(char* line, size_t len) const char* sError = nullptr; size_t iErrorLen = 0; - if (mt == nullptr || mt->IsNull()) + if(mt == nullptr || mt->IsNull()) { /* If there was no error we need a result */ - if ((mt = GetObjectMember(prv->jsonDoc, "result")) == nullptr) + if((mt = GetObjectMember(prv->jsonDoc, "result")) == nullptr) return set_socket_error("PARSE error: Protocol error 7"); } else @@ -359,7 +373,7 @@ bool jpsock::process_line(char* line, size_t len) } std::unique_lock mlock(call_mutex); - if (prv->oCallRsp.pCallData == nullptr) + if(prv->oCallRsp.pCallData == nullptr) { /*Server sent us a call reply without us making a call*/ mlock.unlock(); @@ -400,7 +414,7 @@ bool jpsock::process_pool_job(const opq_json_val* params, const uint64_t message mlock.unlock(); - if (!params->val->IsObject()) + if(!params->val->IsObject()) return set_socket_error("PARSE error: Job error 1"); const Value *blob, *jobid, *target, *motd, *blk_height; @@ -410,7 +424,7 @@ bool jpsock::process_pool_job(const opq_json_val* params, const uint64_t message motd = GetObjectMember(*params->val, "motd"); blk_height = GetObjectMember(*params->val, "height"); - if (jobid == nullptr || blob == nullptr || target == nullptr || + if(jobid == nullptr || blob == nullptr || target == nullptr || !jobid->IsString() || !blob->IsString() || !target->IsString()) { return set_socket_error("PARSE error: Job error 2"); @@ -421,7 +435,7 @@ bool jpsock::process_pool_job(const opq_json_val* params, const uint64_t message std::unique_lock lck(motd_mutex); if(motd->GetStringLength() > 0) { - pool_motd.resize(motd->GetStringLength()/2 + 1); + pool_motd.resize(motd->GetStringLength() / 2 + 1); if(!hex2bin(motd->GetString(), motd->GetStringLength(), (unsigned char*)&pool_motd.front())) pool_motd.clear(); } @@ -429,7 +443,7 @@ bool jpsock::process_pool_job(const opq_json_val* params, const uint64_t message pool_motd.clear(); } - if (jobid->GetStringLength() >= sizeof(pool_job::sJobID)) // Note >= + if(jobid->GetStringLength() >= sizeof(pool_job::sJobID)) // Note >= return set_socket_error("PARSE error: Job error 3"); pool_job oPoolJob; @@ -437,10 +451,10 @@ bool jpsock::process_pool_job(const opq_json_val* params, const uint64_t message const uint32_t iWorkLen = blob->GetStringLength() / 2; oPoolJob.iWorkLen = iWorkLen; - if (iWorkLen > sizeof(pool_job::bWorkBlob)) + if(iWorkLen > sizeof(pool_job::bWorkBlob)) return set_socket_error("PARSE error: Invalid job length. Are you sure you are mining the correct coin?"); - if (!hex2bin(blob->GetString(), iWorkLen * 2, oPoolJob.bWorkBlob)) + if(!hex2bin(blob->GetString(), iWorkLen * 2, oPoolJob.bWorkBlob)) return set_socket_error("PARSE error: Job error 4"); // lock reading of oCurrentJob @@ -479,7 +493,7 @@ bool jpsock::process_pool_job(const opq_json_val* params, const uint64_t message return set_socket_error("PARSE error: Job error 5"); iJobDiff = t64_to_diff(oPoolJob.iTarget); - + if(blk_height != nullptr && blk_height->IsUint64()) oPoolJob.iBlockHeight = bswap_64(blk_height->GetUint64()); @@ -589,10 +603,10 @@ bool jpsock::cmd_login() uint64_t messageId = 0; /*Normal error conditions (failed login etc..) will end here*/ - if (!cmd_ret_wait(cmd_buffer, oResult, messageId)) + if(!cmd_ret_wait(cmd_buffer, oResult, messageId)) return false; - if (!oResult.val->IsObject()) + if(!oResult.val->IsObject()) { set_socket_error("PARSE error: Login protocol error 1"); disconnect(); @@ -603,14 +617,14 @@ bool jpsock::cmd_login() const Value* job = GetObjectMember(*oResult.val, "job"); const Value* ext = GetObjectMember(*oResult.val, "extensions"); - if (id == nullptr || job == nullptr || !id->IsString()) + if(id == nullptr || job == nullptr || !id->IsString()) { set_socket_error("PARSE error: Login protocol error 2"); disconnect(); return false; } - if (id->GetStringLength() >= sizeof(sMinerId)) + if(id->GetStringLength() >= sizeof(sMinerId)) { set_socket_error("PARSE error: Login protocol error 3"); disconnect(); @@ -622,7 +636,7 @@ bool jpsock::cmd_login() if(ext != nullptr && ext->IsArray()) { - for(size_t i=0; i < ext->Size(); i++) + for(size_t i = 0; i < ext->Size(); i++) { const Value& jextname = ext->GetArray()[i]; @@ -693,7 +707,7 @@ bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bRes sResult[64] = '\0'; snprintf(cmd_buffer, sizeof(cmd_buffer), "{\"method\":\"submit\",\"params\":{\"id\":\"%s\",\"job_id\":\"%s\",\"nonce\":\"%s\",\"result\":\"%s\"%s%s%s%s%s%s%s},\"id\":1}\n", - sMinerId, sJobId, sNonce, sResult, sBackend, sHashcount, sAlgo, sBaseAlgo, sIterations,sMemory, sMemAlignBytes); + sMinerId, sJobId, sNonce, sResult, sBackend, sHashcount, sAlgo, sBaseAlgo, sIterations, sMemory, sMemAlignBytes); uint64_t messageId = 0; opq_json_val oResult(nullptr); @@ -732,13 +746,13 @@ bool jpsock::get_pool_motd(std::string& strin) return false; } -inline unsigned char hf_hex2bin(char c, bool &err) +inline unsigned char hf_hex2bin(char c, bool& err) { - if (c >= '0' && c <= '9') + if(c >= '0' && c <= '9') return c - '0'; - else if (c >= 'a' && c <= 'f') + else if(c >= 'a' && c <= 'f') return c - 'a' + 0xA; - else if (c >= 'A' && c <= 'F') + else if(c >= 'A' && c <= 'F') return c - 'A' + 0xA; err = true; @@ -748,17 +762,18 @@ inline unsigned char hf_hex2bin(char c, bool &err) bool jpsock::hex2bin(const char* in, unsigned int len, unsigned char* out) { bool error = false; - for (unsigned int i = 0; i < len; i += 2) + for(unsigned int i = 0; i < len; i += 2) { out[i / 2] = (hf_hex2bin(in[i], error) << 4) | hf_hex2bin(in[i + 1], error); - if (error) return false; + if(error) + return false; } return true; } inline char hf_bin2hex(unsigned char c) { - if (c <= 0x9) + if(c <= 0x9) return '0' + c; else return 'a' - 0xA + c; @@ -766,7 +781,7 @@ inline char hf_bin2hex(unsigned char c) void jpsock::bin2hex(const unsigned char* in, unsigned int len, char* out) { - for (unsigned int i = 0; i < len; i++) + for(unsigned int i = 0; i < len; i++) { out[i * 2] = hf_bin2hex((in[i] & 0xF0) >> 4); out[i * 2 + 1] = hf_bin2hex(in[i] & 0x0F); diff --git a/xmrstak/net/jpsock.hpp b/xmrstak/net/jpsock.hpp index 949764813..4ad6ebbbc 100644 --- a/xmrstak/net/jpsock.hpp +++ b/xmrstak/net/jpsock.hpp @@ -1,15 +1,14 @@ #pragma once -#include "xmrstak/backend/iBackend.hpp" #include "msgstruct.hpp" +#include "xmrstak/backend/iBackend.hpp" #include "xmrstak/jconf.hpp" -#include #include #include -#include +#include #include - +#include /* Our pool can have two kinds of errors: - Parsing or connection error @@ -27,7 +26,7 @@ class base_socket; class jpsock { -public: + public: jpsock(size_t id, const char* sAddr, const char* sLogin, const char* sRigId, const char* sPassword, double pool_weight, bool dev_pool, bool tls, const char* tls_fp, bool nicehash); ~jpsock(); @@ -55,7 +54,12 @@ class jpsock inline bool is_logged_in() { return bLoggedIn; } inline bool is_dev_pool() { return pool; } inline size_t get_pool_id() { return pool_id; } - inline bool get_disconnects(size_t& att, size_t& time) { att = connect_attempts; time = disconnect_time != 0 ? get_timestamp() - disconnect_time + 1 : 0; return pool && usr_login[0]; } + inline bool get_disconnects(size_t& att, size_t& time) + { + att = connect_attempts; + time = disconnect_time != 0 ? get_timestamp() - disconnect_time + 1 : 0; + return pool && usr_login[0]; + } inline const char* get_pool_addr() { return net_addr.c_str(); } inline const char* get_tls_fp() { return tls_fp.c_str(); } inline const char* get_rigid() { return usr_rigid.c_str(); } @@ -77,7 +81,7 @@ class jpsock bool set_socket_error_strerr(const char* a); bool set_socket_error_strerr(const char* a, int res); -private: + private: std::string net_addr; std::string usr_login; std::string usr_rigid; @@ -142,4 +146,3 @@ class jpsock uint64_t iMessageCnt = 0; uint64_t iLastMessageId = 0; }; - diff --git a/xmrstak/net/msgstruct.hpp b/xmrstak/net/msgstruct.hpp index 33980bf42..3cfce3c6f 100644 --- a/xmrstak/net/msgstruct.hpp +++ b/xmrstak/net/msgstruct.hpp @@ -2,25 +2,29 @@ #include "xmrstak/backend/cryptonight.hpp" -#include -#include #include +#include +#include // Structures that we use to pass info between threads constructors are here just to make // the stack allocation take up less space, heap is a shared resource that needs locks too of course struct pool_job { - char sJobID[64]; - uint8_t bWorkBlob[128]; - uint64_t iTarget; - uint32_t iWorkLen; - uint32_t iSavedNonce; - uint64_t iBlockHeight = uint64_t(-1); - - pool_job() : iWorkLen(0), iSavedNonce(0) {} + char sJobID[64]; + uint8_t bWorkBlob[128]; + uint64_t iTarget; + uint32_t iWorkLen; + uint32_t iSavedNonce; + uint64_t iBlockHeight = uint64_t(-1); + + pool_job() : + iWorkLen(0), + iSavedNonce(0) {} pool_job(const char* sJobID, uint64_t iTarget, const uint8_t* bWorkBlob, uint32_t iWorkLen) : - iTarget(iTarget), iWorkLen(iWorkLen), iSavedNonce(0) + iTarget(iTarget), + iWorkLen(iWorkLen), + iSavedNonce(0) { assert(iWorkLen <= sizeof(pool_job::bWorkBlob)); memcpy(this->sJobID, sJobID, sizeof(pool_job::sJobID)); @@ -30,15 +34,17 @@ struct pool_job struct job_result { - uint8_t bResult[32]; - char sJobID[64]; - uint32_t iNonce; - uint32_t iThreadId; + uint8_t bResult[32]; + char sJobID[64]; + uint32_t iNonce; + uint32_t iThreadId; xmrstak_algo algorithm = {invalid_algo}; job_result() {} job_result(const char* sJobID, uint32_t iNonce, const uint8_t* bResult, uint32_t iThreadId, const xmrstak_algo& algo) : - iNonce(iNonce), iThreadId(iThreadId), algorithm(algo) + iNonce(iNonce), + iThreadId(iThreadId), + algorithm(algo) { memcpy(this->sJobID, sJobID, sizeof(job_result::sJobID)); memcpy(this->bResult, bResult, sizeof(job_result::bResult)); @@ -51,8 +57,12 @@ struct sock_err bool silent; sock_err() {} - sock_err(std::string&& err, bool silent) : sSocketError(std::move(err)), silent(silent) { } - sock_err(sock_err&& from) : sSocketError(std::move(from.sSocketError)), silent(from.silent) {} + sock_err(std::string&& err, bool silent) : + sSocketError(std::move(err)), + silent(silent) {} + sock_err(sock_err&& from) : + sSocketError(std::move(from.sSocketError)), + silent(from.silent) {} sock_err& operator=(sock_err&& from) { @@ -62,7 +72,7 @@ struct sock_err return *this; } - ~sock_err() { } + ~sock_err() {} sock_err(sock_err const&) = delete; sock_err& operator=(sock_err const&) = delete; @@ -73,13 +83,30 @@ struct gpu_res_err { size_t idx; // GPU index const char* error_str; - gpu_res_err(const char* error_str, size_t idx) : error_str(error_str), idx(idx) {} + gpu_res_err(const char* error_str, size_t idx) : + error_str(error_str), + idx(idx) {} }; -enum ex_event_name { EV_INVALID_VAL, EV_SOCK_READY, EV_SOCK_ERROR, EV_GPU_RES_ERROR, - EV_POOL_HAVE_JOB, EV_MINER_HAVE_RESULT, EV_PERF_TICK, EV_EVAL_POOL_CHOICE, - EV_USR_HASHRATE, EV_USR_RESULTS, EV_USR_CONNSTAT, EV_HASHRATE_LOOP, - EV_HTML_HASHRATE, EV_HTML_RESULTS, EV_HTML_CONNSTAT, EV_HTML_JSON }; +enum ex_event_name +{ + EV_INVALID_VAL, + EV_SOCK_READY, + EV_SOCK_ERROR, + EV_GPU_RES_ERROR, + EV_POOL_HAVE_JOB, + EV_MINER_HAVE_RESULT, + EV_PERF_TICK, + EV_EVAL_POOL_CHOICE, + EV_USR_HASHRATE, + EV_USR_RESULTS, + EV_USR_CONNSTAT, + EV_HASHRATE_LOOP, + EV_HTML_HASHRATE, + EV_HTML_RESULTS, + EV_HTML_CONNSTAT, + EV_HTML_JSON +}; /* This is how I learned to stop worrying and love c++11 =). @@ -96,20 +123,37 @@ struct ex_event ex_event_name iName; size_t iPoolId; - union - { + union { pool_job oPoolJob; job_result oJobResult; sock_err oSocketError; gpu_res_err oGpuError; }; - ex_event() { iName = EV_INVALID_VAL; iPoolId = 0;} - ex_event(const char* gpu_err, size_t gpu_idx, size_t id) : iName(EV_GPU_RES_ERROR), iPoolId(id), oGpuError(gpu_err, gpu_idx) {} - ex_event(std::string&& err, bool silent, size_t id) : iName(EV_SOCK_ERROR), iPoolId(id), oSocketError(std::move(err), silent) { } - ex_event(job_result dat, size_t id) : iName(EV_MINER_HAVE_RESULT), iPoolId(id), oJobResult(dat) {} - ex_event(pool_job dat, size_t id) : iName(EV_POOL_HAVE_JOB), iPoolId(id), oPoolJob(dat) {} - ex_event(ex_event_name ev, size_t id = 0) : iName(ev), iPoolId(id) {} + ex_event() + { + iName = EV_INVALID_VAL; + iPoolId = 0; + } + ex_event(const char* gpu_err, size_t gpu_idx, size_t id) : + iName(EV_GPU_RES_ERROR), + iPoolId(id), + oGpuError(gpu_err, gpu_idx) {} + ex_event(std::string&& err, bool silent, size_t id) : + iName(EV_SOCK_ERROR), + iPoolId(id), + oSocketError(std::move(err), silent) {} + ex_event(job_result dat, size_t id) : + iName(EV_MINER_HAVE_RESULT), + iPoolId(id), + oJobResult(dat) {} + ex_event(pool_job dat, size_t id) : + iName(EV_POOL_HAVE_JOB), + iPoolId(id), + oPoolJob(dat) {} + ex_event(ex_event_name ev, size_t id = 0) : + iName(ev), + iPoolId(id) {} // Delete the copy operators to make sure we are moving only what is needed ex_event(ex_event const&) = delete; @@ -123,7 +167,7 @@ struct ex_event switch(iName) { case EV_SOCK_ERROR: - new (&oSocketError) sock_err(std::move(from.oSocketError)); + new(&oSocketError) sock_err(std::move(from.oSocketError)); break; case EV_MINER_HAVE_RESULT: oJobResult = from.oJobResult; @@ -151,7 +195,7 @@ struct ex_event switch(iName) { case EV_SOCK_ERROR: - new (&oSocketError) sock_err(); + new(&oSocketError) sock_err(); oSocketError = std::move(from.oSocketError); break; case EV_MINER_HAVE_RESULT: diff --git a/xmrstak/net/socket.cpp b/xmrstak/net/socket.cpp index 6fcb454cd..6a6abac15 100644 --- a/xmrstak/net/socket.cpp +++ b/xmrstak/net/socket.cpp @@ -28,16 +28,17 @@ #include "xmrstak/misc/executor.hpp" #ifndef CONF_NO_TLS -#include #include #include +#include #ifndef OPENSSL_THREADS #error OpenSSL was compiled without thread support #endif #endif -plain_socket::plain_socket(jpsock* err_callback) : pCallback(err_callback) +plain_socket::plain_socket(jpsock* err_callback) : + pCallback(err_callback) { hSocket = INVALID_SOCKET; pSockAddr = nullptr; @@ -50,58 +51,58 @@ bool plain_socket::set_hostname(const char* sAddr) sock_closed = false; size_t ln = strlen(sAddr); - if (ln >= sizeof(sAddrMb)) + if(ln >= sizeof(sAddrMb)) return pCallback->set_socket_error("CONNECT error: Pool address overflow."); memcpy(sAddrMb, sAddr, ln); sAddrMb[ln] = '\0'; - if ((sTmp = strstr(sAddrMb, "//")) != nullptr) + if((sTmp = strstr(sAddrMb, "//")) != nullptr) { sTmp += 2; memmove(sAddrMb, sTmp, strlen(sTmp) + 1); } - if ((sPort = strchr(sAddrMb, ':')) == nullptr) + if((sPort = strchr(sAddrMb, ':')) == nullptr) return pCallback->set_socket_error("CONNECT error: Pool port number not specified, please use format :."); sPort[0] = '\0'; sPort++; - addrinfo hints = { 0 }; + addrinfo hints = {0}; hints.ai_family = AF_UNSPEC; hints.ai_socktype = SOCK_STREAM; hints.ai_protocol = IPPROTO_TCP; pAddrRoot = nullptr; int err; - if ((err = getaddrinfo(sAddrMb, sPort, &hints, &pAddrRoot)) != 0) + if((err = getaddrinfo(sAddrMb, sPort, &hints, &pAddrRoot)) != 0) return pCallback->set_socket_error_strerr("CONNECT error: GetAddrInfo: ", err); - addrinfo *ptr = pAddrRoot; + addrinfo* ptr = pAddrRoot; std::vector ipv4; std::vector ipv6; - while (ptr != nullptr) + while(ptr != nullptr) { - if (ptr->ai_family == AF_INET) + if(ptr->ai_family == AF_INET) ipv4.push_back(ptr); - if (ptr->ai_family == AF_INET6) + if(ptr->ai_family == AF_INET6) ipv6.push_back(ptr); ptr = ptr->ai_next; } - if (ipv4.empty() && ipv6.empty()) + if(ipv4.empty() && ipv6.empty()) { freeaddrinfo(pAddrRoot); pAddrRoot = nullptr; return pCallback->set_socket_error("CONNECT error: I found some DNS records but no IPv4 or IPv6 addresses."); } - else if (!ipv4.empty() && ipv6.empty()) + else if(!ipv4.empty() && ipv6.empty()) pSockAddr = ipv4[rand() % ipv4.size()]; - else if (ipv4.empty() && !ipv6.empty()) + else if(ipv4.empty() && !ipv6.empty()) pSockAddr = ipv6[rand() % ipv6.size()]; - else if (!ipv4.empty() && !ipv6.empty()) + else if(!ipv4.empty() && !ipv6.empty()) { if(jconf::inst()->PreferIpv4()) pSockAddr = ipv4[rand() % ipv4.size()]; @@ -111,7 +112,7 @@ bool plain_socket::set_hostname(const char* sAddr) hSocket = socket(pSockAddr->ai_family, pSockAddr->ai_socktype, pSockAddr->ai_protocol); - if (hSocket == INVALID_SOCKET) + if(hSocket == INVALID_SOCKET) { freeaddrinfo(pAddrRoot); pAddrRoot = nullptr; @@ -120,7 +121,7 @@ bool plain_socket::set_hostname(const char* sAddr) int flag = 1; /* If it fails, it fails, we won't loose too much sleep over it */ - setsockopt(hSocket, IPPROTO_TCP, TCP_NODELAY, (char *) &flag, sizeof(int)); + setsockopt(hSocket, IPPROTO_TCP, TCP_NODELAY, (char*)&flag, sizeof(int)); return true; } @@ -133,7 +134,7 @@ bool plain_socket::connect() freeaddrinfo(pAddrRoot); pAddrRoot = nullptr; - if (ret != 0) + if(ret != 0) return pCallback->set_socket_error_strerr("CONNECT error: "); else return true; @@ -158,10 +159,10 @@ bool plain_socket::send(const char* buf) { size_t pos = 0; size_t slen = strlen(buf); - while (pos != slen) + while(pos != slen) { int ret = ::send(hSocket, buf + pos, slen - pos, 0); - if (ret == SOCKET_ERROR) + if(ret == SOCKET_ERROR) { pCallback->set_socket_error_strerr("SEND error: "); return false; @@ -184,7 +185,8 @@ void plain_socket::close(bool free) } #ifndef CONF_NO_TLS -tls_socket::tls_socket(jpsock* err_callback) : pCallback(err_callback) +tls_socket::tls_socket(jpsock* err_callback) : + pCallback(err_callback) { } @@ -193,7 +195,7 @@ void tls_socket::print_error() BIO* err_bio = BIO_new(BIO_s_mem()); ERR_print_errors(err_bio); - char *buf = nullptr; + char* buf = nullptr; size_t len = BIO_get_mem_data(err_bio, &buf); if(buf == nullptr) @@ -247,7 +249,7 @@ bool tls_socket::set_hostname(const char* sAddr) int flag = 1; /* If it fails, it fails, we won't loose too much sleep over it */ - setsockopt(BIO_get_fd(bio, nullptr), IPPROTO_TCP, TCP_NODELAY, (char *) &flag, sizeof(int)); + setsockopt(BIO_get_fd(bio, nullptr), IPPROTO_TCP, TCP_NODELAY, (char*)&flag, sizeof(int)); if(BIO_set_conn_hostname(bio, sAddr) != 1) { @@ -327,7 +329,7 @@ bool tls_socket::connect() BIO_flush(b64); const char* conf_md = pCallback->get_tls_fp(); - char *b64_md = nullptr; + char* b64_md = nullptr; size_t b64_len = BIO_get_mem_data(bmem, &b64_md); if(strlen(conf_md) == 0) @@ -393,4 +395,3 @@ void tls_socket::close(bool free) } } #endif - diff --git a/xmrstak/net/socket.hpp b/xmrstak/net/socket.hpp index b09142d56..88b665adf 100644 --- a/xmrstak/net/socket.hpp +++ b/xmrstak/net/socket.hpp @@ -1,26 +1,26 @@ #pragma once -#include #include "socks.hpp" +#include class jpsock; class base_socket { -public: + public: virtual bool set_hostname(const char* sAddr) = 0; virtual bool connect() = 0; virtual int recv(char* buf, unsigned int len) = 0; virtual bool send(const char* buf) = 0; virtual void close(bool free) = 0; -protected: + protected: std::atomic sock_closed; }; class plain_socket : public base_socket { -public: + public: plain_socket(jpsock* err_callback); bool set_hostname(const char* sAddr); @@ -29,10 +29,10 @@ class plain_socket : public base_socket bool send(const char* buf); void close(bool free); -private: + private: jpsock* pCallback; - addrinfo *pSockAddr; - addrinfo *pAddrRoot; + addrinfo* pSockAddr; + addrinfo* pAddrRoot; SOCKET hSocket; }; @@ -42,7 +42,7 @@ typedef struct ssl_st SSL; class tls_socket : public base_socket { -public: + public: tls_socket(jpsock* err_callback); bool set_hostname(const char* sAddr); @@ -51,7 +51,7 @@ class tls_socket : public base_socket bool send(const char* buf); void close(bool free); -private: + private: void init_ctx(); void print_error(); diff --git a/xmrstak/net/socks.hpp b/xmrstak/net/socks.hpp index 86749e527..600e4d276 100644 --- a/xmrstak/net/socks.hpp +++ b/xmrstak/net/socks.hpp @@ -2,18 +2,19 @@ #ifdef _WIN32 #ifndef _WIN32_WINNT -#define _WIN32_WINNT 0x0601 /* Windows 7 */ +#define _WIN32_WINNT 0x0601 /* Windows 7 */ #endif + #include #include +// this comment disable clang include reordering for windows.h #include - inline void sock_init() { static bool bWSAInit = false; - if (!bWSAInit) + if(!bWSAInit) { WSADATA wsaData; WSAStartup(MAKEWORD(2, 2), &wsaData); @@ -56,20 +57,20 @@ inline const char* sock_gai_strerror(int err, char* buf, size_t len) #else /* Assume that any non-Windows platform uses POSIX-style sockets instead. */ -#include #include -#include /* Needed for getaddrinfo() and freeaddrinfo() */ -#include /* Needed for close() */ #include -#include +#include /* Needed for getaddrinfo() and freeaddrinfo() */ #include /* Needed for IPPROTO_TCP */ #include +#include +#include +#include /* Needed for close() */ inline void sock_init() {} typedef int SOCKET; -#define INVALID_SOCKET (-1) -#define SOCKET_ERROR (-1) +#define INVALID_SOCKET (-1) +#define SOCKET_ERROR (-1) inline void sock_close(SOCKET s) { diff --git a/xmrstak/params.hpp b/xmrstak/params.hpp index 936b1e9a6..473511428 100644 --- a/xmrstak/params.hpp +++ b/xmrstak/params.hpp @@ -1,6 +1,7 @@ #pragma once #include "xmrstak/misc/environment.hpp" +#include "xmrstak/misc/home_dir.hpp" #include @@ -46,6 +47,7 @@ struct params std::string configFile; std::string configFilePools; std::string configFileAMD; + std::string rootAMDCacheDir; std::string configFileNVIDIA; std::string configFileCPU; @@ -70,10 +72,11 @@ struct params configFile("config.txt"), configFilePools("pools.txt"), configFileAMD("amd.txt"), + rootAMDCacheDir(get_home() + "/.openclcache/"), configFileCPU("cpu.txt"), configFileNVIDIA("nvidia.txt") - {} - + { + } }; } // namespace xmrstak diff --git a/xmrstak/version.cpp b/xmrstak/version.cpp index 644f82b19..3266a6b91 100644 --- a/xmrstak/version.cpp +++ b/xmrstak/version.cpp @@ -2,7 +2,9 @@ //! git will put "#define GIT_ARCHIVE 1" on the next line inside archives. $Format:%n#define GIT_ARCHIVE 1$ #if defined(GIT_ARCHIVE) && !defined(GIT_COMMIT_HASH) -#define GIT_COMMIT_HASH $Format:%h$ +#define GIT_COMMIT_HASH \ + $Format: \ + % h$ #endif #ifndef GIT_COMMIT_HASH @@ -18,7 +20,7 @@ #endif #define XMR_STAK_NAME "xmr-stak" -#define XMR_STAK_VERSION "2.10.1-hide-3.1.1" +#define XMR_STAK_VERSION "2.10.4-hide-3.1.2" #if defined(_WIN32) #define OS_TYPE "win" @@ -35,10 +37,10 @@ #define XMRSTAK_PP_TOSTRING1(str) #str #define XMRSTAK_PP_TOSTRING(str) XMRSTAK_PP_TOSTRING1(str) -#define VERSION_LONG XMR_STAK_NAME "/" XMR_STAK_VERSION "/" XMRSTAK_PP_TOSTRING(GIT_COMMIT_HASH) "/" XMRSTAK_PP_TOSTRING(GIT_BRANCH) "/" OS_TYPE "/" XMRSTAK_PP_TOSTRING(BACKEND_TYPE) "/" +#define VERSION_LONG XMR_STAK_NAME "/" XMR_STAK_VERSION "/" XMRSTAK_PP_TOSTRING(GIT_COMMIT_HASH) "/" XMRSTAK_PP_TOSTRING(GIT_BRANCH) "/" OS_TYPE "/" XMRSTAK_PP_TOSTRING(BACKEND_TYPE) "/" #define VERSION_SHORT XMR_STAK_NAME " " XMR_STAK_VERSION " " XMRSTAK_PP_TOSTRING(GIT_COMMIT_HASH) #define VERSION_HTML "v" XMR_STAK_VERSION "-" XMRSTAK_PP_TOSTRING(GIT_COMMIT_HASH) -const char ver_long[] = VERSION_LONG; +const char ver_long[] = VERSION_LONG; const char ver_short[] = VERSION_SHORT; const char ver_html[] = VERSION_HTML; diff --git a/xmrstak/version.hpp b/xmrstak/version.hpp index cdf82f30d..85905f01c 100644 --- a/xmrstak/version.hpp +++ b/xmrstak/version.hpp @@ -1,8 +1,8 @@ #pragma once +#include "donate-level.hpp" #include #include -#include "donate-level.hpp" extern const char ver_long[]; extern const char ver_short[]; @@ -10,7 +10,7 @@ extern const char ver_html[]; inline std::string get_version_str() { - return std::string(ver_long) + std::to_string(uint32_t(fDevDonationLevel * 1000)) ; + return std::string(ver_long) + std::to_string(uint32_t(fDevDonationLevel * 1000)); } inline std::string get_version_str_short()
Error text
CountLast seen
Error text
CountLast seen
%s
%llu%s