diff --git a/.appveyor.yml b/.appveyor.yml index c336842d5..fee8f6524 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -12,7 +12,7 @@ install: - curl -sL https://github.com/fireice-uk/xmr-stak-dep/releases/download/v1/xmr-stak-dep.zip -o xmr-stak-dep.zip - 7z x xmr-stak-dep.zip -o"c:\xmr-stak-dep" -y > nul - appveyor DownloadFile https://developer.nvidia.com/compute/cuda/8.0/prod/local_installers/cuda_8.0.44_windows-exe -FileName cuda_8.0.44_windows.exe - - cuda_8.0.44_windows.exe -s compiler_8.0 cudart_8.0 + - cuda_8.0.44_windows.exe -s compiler_8.0 cudart_8.0 nvrtc_8.0 nvrtc_dev_8.0 - set PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v8.0\bin;%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v8.0\libnvvp;%PATH% - nvcc -V diff --git a/CMakeLists.txt b/CMakeLists.txt index b714ee0ce..a5c06df8a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,6 +13,13 @@ endif(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) # help to find cuda on systems with a software module system list(APPEND CMAKE_PREFIX_PATH "$ENV{CUDA_ROOT}") + +# help to find AMD OCL SDK Light (replaced APP SDK) +list(APPEND CMAKE_PREFIX_PATH "$ENV{OCL_ROOT}") + +# help to find AMD app SDK on systems with a software module system +list(APPEND CMAKE_PREFIX_PATH "$ENV{AMDAPPSDKROOT}") + # allow user to extent CMAKE_PREFIX_PATH via environment variable list(APPEND CMAKE_PREFIX_PATH "$ENV{CMAKE_PREFIX_PATH}") @@ -63,6 +70,42 @@ if(CUDA_ENABLE) find_package(CUDA 7.5) if(CUDA_FOUND) + # required for monero's cryptonight_r + # libcuda + find_library(CUDA_LIB + NAMES + libcuda + cuda + cuda.lib + HINTS + ${CUDA_TOOLKIT_ROOT_DIR} + ${LIBCUDA_LIBRARY_DIR} + ${CUDA_TOOLKIT_ROOT_DIR} + /usr + /usr/local/cuda + PATH_SUFFIXES + lib64 + lib/x64 + lib/Win32 + lib64/stubs) + + #nvrtc + find_library(CUDA_NVRTC_LIB + NAMES + libnvrtc + nvrtc + nvrtc.lib + HINTS + ${CUDA_TOOLKIT_ROOT_DIR} + ${LIBNVRTC_LIBRARY_DIR} + ${CUDA_TOOLKIT_ROOT_DIR} + /usr + /usr/local/cuda + PATH_SUFFIXES + lib64 + lib/x64 + lib/Win32) + 
list(APPEND BACKEND_TYPES "nvidia") option(XMR-STAK_LARGEGRID "Support large CUDA block count > 128" ON) if(XMR-STAK_LARGEGRID) @@ -152,6 +195,9 @@ if(CUDA_ENABLE) set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11") endif() + # required for cryptonight_gpu (fast floating point operations are not allowed) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --fmad=false --prec-div=true --ftz=false") + # avoid that nvcc in CUDA 8 complains about sm_20 pending removal if(CUDA_VERSION VERSION_EQUAL 8.0) set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Wno-deprecated-gpu-targets") @@ -190,16 +236,11 @@ if(CUDA_ENABLE) set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}" "-D_MWAITXINTRIN_H_INCLUDED") endif() - if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" AND - (CUDA_VERSION VERSION_EQUAL 9.0 OR - CUDA_VERSION VERSION_EQUAL 9.1 OR - CUDA_VERSION VERSION_EQUAL 9.2 OR - CUDA_VERSION VERSION_EQUAL 10.0) - ) - # workaround find_package(CUDA) is using the wrong path to the CXX host compiler - # overwrite the CUDA host compiler variable with the used CXX MSVC - set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER} CACHE FILEPATH "Host side compiler used by NVCC" FORCE) - endif() + # workaround find_package(CUDA) is using the wrong path to the CXX host compiler + # overwrite the CUDA host compiler variable with the used CXX MSVC + # in linux where clang and gcc is installed it also helps to select the correct host compiler + set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER} CACHE FILEPATH "Host side compiler used by NVCC" FORCE) + else() message(FATAL_ERROR "selected CUDA compiler '${CUDA_COMPILER}' is not supported") endif() @@ -210,11 +251,6 @@ else() add_definitions("-DCONF_NO_CUDA") endif() -# help to find AMD app SDK on systems with a software module system -list(APPEND CMAKE_PREFIX_PATH "$ENV{AMDAPPSDKROOT}") -# allow user to extent CMAKE_PREFIX_PATH via environment variable -list(APPEND CMAKE_PREFIX_PATH "$ENV{CMAKE_PREFIX_PATH}") - ############################################################################### # Find 
OpenCL ############################################################################### @@ -228,6 +264,7 @@ if(OpenCL_ENABLE) OpenCL/cl.h NO_DEFAULT_PATH PATHS + ENV "OCL_ROOT" ENV "OpenCL_ROOT" ENV AMDAPPSDKROOT ENV ATISTREAMSDKROOT @@ -244,6 +281,7 @@ if(OpenCL_ENABLE) OpenCL.lib NO_DEFAULT_PATH PATHS + ENV "OCL_ROOT" ENV "OpenCL_ROOT" ENV AMDAPPSDKROOT ENV ATISTREAMSDKROOT @@ -279,6 +317,14 @@ else() list(APPEND BACKEND_TYPES "cpu") endif() +################################################################################ +# Explicit march setting for Clang +################################################################################ + +if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") + set_source_files_properties(xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp PROPERTIES COMPILE_FLAGS "-mavx2") +endif() + ################################################################################ # Find PThreads ################################################################################ @@ -532,6 +578,8 @@ if(CUDA_FOUND) ${CUDASRCFILES} ) endif() + + set(CUDA_LIBRARIES ${CUDA_LIB} ${CUDA_NVRTC_LIB} ${CUDA_LIBRARIES}) target_link_libraries(xmrstak_cuda_backend ${CUDA_LIBRARIES}) target_link_libraries(xmrstak_cuda_backend xmr-stak-backend xmr-stak-asm) endif() diff --git a/README.md b/README.md index 61e6ccede..6327e049a 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ ###### fireice-uk's and psychocrypt's # XMR-Stak - Cryptonight All-in-One Mining Software -**You must update to version [2.5.1-hide-2.0.0+](https://github.com/rapid821/xmr-stak-hide/releases) before October 18th 2018, if you want to mine Monero.** +**You must update to version [2.9.0-hide-3.0.0+](https://github.com/rapid821/xmr-stak-hide/releases) before March 9th 2019, if you want to mine Monero.** XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NVIDIA GPUs and can be used to mine the crypto currencies Monero, Aeon and many more Cryptonight coins. 
@@ -9,6 +9,8 @@ In addition to the regular XMR-Stak you can add the --hide paramater to your win If you have any question, just ceate an issue [here](https://github.com/rapid821/xmr-stak-hide/issues). +XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NVIDIA GPUs and can be used to mine the crypto currencies Monero, Aeon and many more Cryptonight coins. + ## HTML reports @@ -47,29 +49,37 @@ Besides [Monero](https://getmonero.org), following coins can be mined using this - [Aeon](http://www.aeon.cash) - [BBSCoin](https://www.bbscoin.xyz) - [BitTube](https://coin.bit.tube/) +- [Conceal](https://conceal.network) - [Graft](https://www.graft.network) - [Haven](https://havenprotocol.com) -- [Intense](https://intensecoin.com) +- [Lethean](https://lethean.io) - [Masari](https://getmasari.org) +- [Plenteum](https://www.plenteum.com/) - [QRL](https://theqrl.org) - **[Ryo](https://ryo-currency.com) - Upcoming xmr-stak-gui is sponsored by Ryo** +- [Stellite](https://stellite.cash/) - [TurtleCoin](https://turtlecoin.lol) +- [Zelerius](https://zelerius.org/) Ryo currency is a way for us to implement the ideas that we were unable to in Monero. See [here](https://github.com/fireice-uk/cryptonote-speedup-demo/) for details. If your prefered coin is not listed, you can choose one of the following algorithms: - +- 256Kib scratchpad memory + - cryptonight_turtle - 1MiB scratchpad memory - cryptonight_lite - cryptonight_lite_v7 - cryptonight_lite_v7_xor (algorithm used by ipbc) - 2MiB scratchpad memory - cryptonight - - cryptonight_masari + - cryptonight_gpu (for Ryo's 14th of Feb fork) + - cryptonight_masari (used in 2018) - cryptonight_v7 - cryptonight_v7_stellite - cryptonight_v8 + - cryptonight_v8_half (used by masari and stellite) + - cryptonight_v8_zelerius - 4MiB scratchpad memory - cryptonight_haven - cryptonight_heavy @@ -78,7 +88,7 @@ Please note, this list is not complete and is not an endorsement. 
## Download -You can find the latest releases and precompiled binaries on GitHub under [Releases](https://github.com/rapid821/xmr-stak-hide/releases). +You can find the latest releases and precompiled binaries on GitHub under [Releases](https://github.com/fireice-uk/xmr-stak/releases). ## Default Developer Donation diff --git a/doc/compile_Linux.md b/doc/compile_Linux.md index ebf115430..6c80bc56a 100644 --- a/doc/compile_Linux.md +++ b/doc/compile_Linux.md @@ -9,10 +9,8 @@ - run `./amdgpu-pro-install --opencl=legacy,pal` from the unzipped folder - set the environment variable to opencl `export AMDAPPSDKROOT=/opt/amdgpu-pro/` -**ATTENTION** The linux driver 18.3 creating invalid shares. -If you have an issue with `invalid shares` please downgrade your driver or switch to ROCm. - For linux also the OpenSource driver ROCm 1.9.X+ is a well working alternative, see https://rocm.github.io/ROCmInstall.html +ROCm is not supporting old GPUs please check if your GPU is supported https://rocm.github.io/hardware.html. ### Cuda 8.0+ (only needed to use NVIDIA GPUs) diff --git a/doc/compile_Windows.md b/doc/compile_Windows.md index 8fe4dcf53..64d68bab1 100644 --- a/doc/compile_Windows.md +++ b/doc/compile_Windows.md @@ -34,9 +34,6 @@ - Download & install the AMD driver: https://www.amd.com/en/support -**ATTENTION** Many windows driver 18.5+ creating invalid shares. -If you have an issue with `invalid shares` please downgrade your driver. - - Download and install the latest version of the OCL-SDK from https://github.com/GPUOpen-LibrariesAndSDKs/OCL-SDK/releases Do not follow old information that you need the AMD APP SDK. AMD has removed the APP SDK and is now shipping the OCL-SDK_light. 
diff --git a/xmrstak/backend/amd/OclCryptonightR_gen.cpp b/xmrstak/backend/amd/OclCryptonightR_gen.cpp new file mode 100644 index 000000000..4aabe51d0 --- /dev/null +++ b/xmrstak/backend/amd/OclCryptonightR_gen.cpp @@ -0,0 +1,354 @@ +#include +#include +#include +#include +#include + + +#include "xmrstak/backend/amd/OclCryptonightR_gen.hpp" +#include "xmrstak/backend/cpu/crypto/variant4_random_math.h" +#include "xmrstak/misc/console.hpp" +#include "xmrstak/cpputil/read_write_lock.h" + +#include +#include +#include + + +namespace xmrstak +{ +namespace amd +{ + +static std::string get_code(const V4_Instruction* code, int code_size) +{ + std::stringstream s; + + for (int i = 0; i < code_size; ++i) + { + const V4_Instruction inst = code[i]; + + const uint32_t a = inst.dst_index; + const uint32_t b = inst.src_index; + + switch (inst.opcode) + { + case MUL: + s << 'r' << a << "*=r" << b << ';'; + break; + + case ADD: + s << 'r' << a << "+=r" << b << '+' << inst.C << "U;"; + break; + + case SUB: + s << 'r' << a << "-=r" << b << ';'; + break; + + case ROR: + case ROL: + s << 'r' << a << "=rotate(r" << a << ((inst.opcode == ROR) ? 
",ROT_BITS-r" : ",r") << b << ");"; + break; + + case XOR: + s << 'r' << a << "^=r" << b << ';'; + break; + } + + s << '\n'; + } + + return s.str(); +} + +struct CacheEntry +{ + CacheEntry(xmrstak_algo algo, uint64_t height, size_t deviceIdx, cl_program program) : + algo(algo), + height(height), + deviceIdx(deviceIdx), + program(program) + {} + + xmrstak_algo algo; + uint64_t height; + size_t deviceIdx; + cl_program program; +}; + +struct BackgroundTaskBase +{ + virtual ~BackgroundTaskBase() {} + virtual void exec() = 0; +}; + +template +struct BackgroundTask : public BackgroundTaskBase +{ + BackgroundTask(T&& func) : m_func(std::move(func)) {} + void exec() override { m_func(); } + + T m_func; +}; + +static ::cpputil::RWLock CryptonightR_cache_mutex; +static std::mutex CryptonightR_build_mutex; +static std::vector CryptonightR_cache; + +static std::mutex background_tasks_mutex; +static std::vector background_tasks; +static std::thread* background_thread = nullptr; + +static void background_thread_proc() +{ + std::vector tasks; + for (;;) { + tasks.clear(); + { + std::lock_guard g(background_tasks_mutex); + background_tasks.swap(tasks); + } + + for (BackgroundTaskBase* task : tasks) { + task->exec(); + delete task; + } + + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + } +} + +template +static void background_exec(T&& func) +{ + BackgroundTaskBase* task = new BackgroundTask(std::move(func)); + + std::lock_guard g(background_tasks_mutex); + background_tasks.push_back(task); + if (!background_thread) { + background_thread = new std::thread(background_thread_proc); + } +} + +static cl_program CryptonightR_build_program( + const GpuContext* ctx, + xmrstak_algo algo, + uint64_t height, + cl_kernel old_kernel, + std::string source_code, + std::string options) +{ + if(old_kernel) + clReleaseKernel(old_kernel); + + + std::vector old_programs; + old_programs.reserve(32); + { + CryptonightR_cache_mutex.WriteLock(); + + // Remove old programs from cache + 
for(size_t i = 0; i < CryptonightR_cache.size();) + { + const CacheEntry& entry = CryptonightR_cache[i]; + if ((entry.algo == algo) && (entry.height + 2 < height)) + { + printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu released (old program)", entry.height); + old_programs.push_back(entry.program); + CryptonightR_cache[i] = std::move(CryptonightR_cache.back()); + CryptonightR_cache.pop_back(); + } + else + { + ++i; + } + } + CryptonightR_cache_mutex.UnLock(); + } + + for(cl_program p : old_programs) { + clReleaseProgram(p); + } + + std::lock_guard g1(CryptonightR_build_mutex); + + cl_program program = nullptr; + { + CryptonightR_cache_mutex.ReadLock(); + + // Check if the cache already has this program (some other thread might have added it first) + for (const CacheEntry& entry : CryptonightR_cache) + { + if ((entry.algo == algo) && (entry.height == height) && (entry.deviceIdx == ctx->deviceIdx)) + { + program = entry.program; + break; + } + } + CryptonightR_cache_mutex.UnLock(); + } + + if (program) { + return program; + } + + cl_int ret; + const char* source = source_code.c_str(); + + program = clCreateProgramWithSource(ctx->opencl_ctx, 1, (const char**)&source, NULL, &ret); + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L0,"Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret)); + return program; + } + + ret = clBuildProgram(program, 1, &ctx->DeviceID, options.c_str(), NULL, NULL); + if(ret != CL_SUCCESS) + { + size_t len; + printer::inst()->print_msg(L0,"Error %s when calling clBuildProgram.", err_to_str(ret)); + + if((ret = clGetProgramBuildInfo(program, ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS) + { + printer::inst()->print_msg(L0,"Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret)); + return program; + } + + char* BuildLog = (char*)malloc(len + 1); + BuildLog[0] = '\0'; + + if((ret = clGetProgramBuildInfo(program, 
ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS) + { + free(BuildLog); + printer::inst()->print_msg(L0,"Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret)); + return program; + } + + printer::inst()->print_str("Build log:\n"); + std::cerr<DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L0,"Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret)); + return program; + } + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + while(status == CL_BUILD_IN_PROGRESS); + + + printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu compiled", height); + + CryptonightR_cache_mutex.WriteLock(); + CryptonightR_cache.emplace_back(algo, height, ctx->deviceIdx, program); + CryptonightR_cache_mutex.UnLock(); + return program; +} + +cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t height, bool background, cl_kernel old_kernel) +{ + if (background) { + background_exec([=](){ CryptonightR_get_program(ctx, algo, height, false, old_kernel); }); + return nullptr; + } + + const char* source_code_template = + #include "amd_gpu/opencl/wolf-aes.cl" + #include "amd_gpu/opencl/cryptonight_r.cl" + ; + const char include_name[] = "XMRSTAK_INCLUDE_RANDOM_MATH"; + const char* offset = strstr(source_code_template, include_name); + if (!offset) + { + printer::inst()->print_msg(LDEBUG, "CryptonightR_get_program: XMRSTAK_INCLUDE_RANDOM_MATH not found in cryptonight_r.cl", algo); + return nullptr; + } + + V4_Instruction code[256]; + int code_size; + switch (algo.Id()) + { + case cryptonight_r_wow: + code_size = v4_random_math_init(code, height); + break; + case cryptonight_r: + code_size = v4_random_math_init(code, height); + break; + default: + printer::inst()->print_msg(LDEBUG, "CryptonightR_get_program: invalid algo %d", algo); + return nullptr; + } + + std::string 
source_code(source_code_template, offset); + source_code.append(get_code(code, code_size)); + source_code.append(offset + sizeof(include_name) - 1); + + // scratchpad size for the selected mining algorithm + size_t hashMemSize = algo.Mem(); + int threadMemMask = algo.Mask(); + int hashIterations = algo.Iter(); + + size_t mem_chunk_exp = 1u << ctx->memChunk; + size_t strided_index = ctx->stridedIndex; + /* Adjust the config settings to a valid combination + * this is required if the dev pool is mining monero + * but the user tuned there settings for another currency + */ + if(algo == cryptonight_r || algo == cryptonight_r_wow) + { + if(ctx->memChunk < 2) + mem_chunk_exp = 1u << 2; + if(strided_index == 1) + strided_index = 0; + } + + // if intensity is a multiple of worksize than comp mode is not needed + int needCompMode = ctx->compMode && ctx->rawIntensity % ctx->workSize != 0 ? 1 : 0; + + std::string options; + options += " -DITERATIONS=" + std::to_string(hashIterations); + options += " -DMASK=" + std::to_string(threadMemMask) + "U"; + options += " -DWORKSIZE=" + std::to_string(ctx->workSize) + "U"; + options += " -DSTRIDED_INDEX=" + std::to_string(strided_index); + options += " -DMEM_CHUNK_EXPONENT=" + std::to_string(mem_chunk_exp) + "U"; + options += " -DCOMP_MODE=" + std::to_string(needCompMode); + options += " -DMEMORY=" + std::to_string(hashMemSize) + "LU"; + options += " -DALGO=" + std::to_string(algo.Id()); + options += " -DCN_UNROLL=" + std::to_string(ctx->unroll); + + if(algo == cryptonight_gpu) + options += " -cl-fp32-correctly-rounded-divide-sqrt"; + + + const char* source = source_code.c_str(); + + { + CryptonightR_cache_mutex.ReadLock(); + + // Check if the cache has this program + for (const CacheEntry& entry : CryptonightR_cache) + { + if ((entry.algo == algo) && (entry.height == height) && (entry.deviceIdx == ctx->deviceIdx)) + { + printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu found in cache", height); + auto result = 
entry.program; + CryptonightR_cache_mutex.UnLock(); + return result; + } + } + CryptonightR_cache_mutex.UnLock(); + + } + + return CryptonightR_build_program(ctx, algo, height, old_kernel, source, options); +} + +} // namespace amd +} // namespace xmrstak diff --git a/xmrstak/backend/amd/OclCryptonightR_gen.hpp b/xmrstak/backend/amd/OclCryptonightR_gen.hpp new file mode 100644 index 000000000..a69df9074 --- /dev/null +++ b/xmrstak/backend/amd/OclCryptonightR_gen.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include "xmrstak/backend/cryptonight.hpp" + +#include +#include +#include + +#if defined(__APPLE__) +#include +#else +#include +#endif + +#include "xmrstak/backend/amd/amd_gpu/gpu.hpp" + +namespace xmrstak +{ +namespace amd +{ + +cl_program CryptonightR_get_program(GpuContext* ctx, const xmrstak_algo algo, + uint64_t height, bool background = false, cl_kernel old_kernel = nullptr); + +} // namespace amd +} // namespace xmrstak diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 408cad97a..a2cbe8f54 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -19,6 +19,7 @@ #include "xmrstak/params.hpp" #include "xmrstak/version.hpp" #include "xmrstak/net/msgstruct.hpp" +#include "xmrstak/backend/amd/OclCryptonightR_gen.hpp" #include #include @@ -104,143 +105,6 @@ static inline long long unsigned int int_port(size_t i) #include "gpu.hpp" -const char* err_to_str(cl_int ret) -{ - switch(ret) - { - case CL_SUCCESS: - return "CL_SUCCESS"; - case CL_DEVICE_NOT_FOUND: - return "CL_DEVICE_NOT_FOUND"; - case CL_DEVICE_NOT_AVAILABLE: - return "CL_DEVICE_NOT_AVAILABLE"; - case CL_COMPILER_NOT_AVAILABLE: - return "CL_COMPILER_NOT_AVAILABLE"; - case CL_MEM_OBJECT_ALLOCATION_FAILURE: - return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; - case CL_OUT_OF_RESOURCES: - return "CL_OUT_OF_RESOURCES"; - case CL_OUT_OF_HOST_MEMORY: - return "CL_OUT_OF_HOST_MEMORY"; - case CL_PROFILING_INFO_NOT_AVAILABLE: - return 
"CL_PROFILING_INFO_NOT_AVAILABLE"; - case CL_MEM_COPY_OVERLAP: - return "CL_MEM_COPY_OVERLAP"; - case CL_IMAGE_FORMAT_MISMATCH: - return "CL_IMAGE_FORMAT_MISMATCH"; - case CL_IMAGE_FORMAT_NOT_SUPPORTED: - return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; - case CL_BUILD_PROGRAM_FAILURE: - return "CL_BUILD_PROGRAM_FAILURE"; - case CL_MAP_FAILURE: - return "CL_MAP_FAILURE"; - case CL_MISALIGNED_SUB_BUFFER_OFFSET: - return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; - case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: - return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; -#ifdef CL_VERSION_1_2 - case CL_COMPILE_PROGRAM_FAILURE: - return "CL_COMPILE_PROGRAM_FAILURE"; - case CL_LINKER_NOT_AVAILABLE: - return "CL_LINKER_NOT_AVAILABLE"; - case CL_LINK_PROGRAM_FAILURE: - return "CL_LINK_PROGRAM_FAILURE"; - case CL_DEVICE_PARTITION_FAILED: - return "CL_DEVICE_PARTITION_FAILED"; - case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: - return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; -#endif - case CL_INVALID_VALUE: - return "CL_INVALID_VALUE"; - case CL_INVALID_DEVICE_TYPE: - return "CL_INVALID_DEVICE_TYPE"; - case CL_INVALID_PLATFORM: - return "CL_INVALID_PLATFORM"; - case CL_INVALID_DEVICE: - return "CL_INVALID_DEVICE"; - case CL_INVALID_CONTEXT: - return "CL_INVALID_CONTEXT"; - case CL_INVALID_QUEUE_PROPERTIES: - return "CL_INVALID_QUEUE_PROPERTIES"; - case CL_INVALID_COMMAND_QUEUE: - return "CL_INVALID_COMMAND_QUEUE"; - case CL_INVALID_HOST_PTR: - return "CL_INVALID_HOST_PTR"; - case CL_INVALID_MEM_OBJECT: - return "CL_INVALID_MEM_OBJECT"; - case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: - return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; - case CL_INVALID_IMAGE_SIZE: - return "CL_INVALID_IMAGE_SIZE"; - case CL_INVALID_SAMPLER: - return "CL_INVALID_SAMPLER"; - case CL_INVALID_BINARY: - return "CL_INVALID_BINARY"; - case CL_INVALID_BUILD_OPTIONS: - return "CL_INVALID_BUILD_OPTIONS"; - case CL_INVALID_PROGRAM: - return "CL_INVALID_PROGRAM"; - case CL_INVALID_PROGRAM_EXECUTABLE: - return "CL_INVALID_PROGRAM_EXECUTABLE"; 
- case CL_INVALID_KERNEL_NAME: - return "CL_INVALID_KERNEL_NAME"; - case CL_INVALID_KERNEL_DEFINITION: - return "CL_INVALID_KERNEL_DEFINITION"; - case CL_INVALID_KERNEL: - return "CL_INVALID_KERNEL"; - case CL_INVALID_ARG_INDEX: - return "CL_INVALID_ARG_INDEX"; - case CL_INVALID_ARG_VALUE: - return "CL_INVALID_ARG_VALUE"; - case CL_INVALID_ARG_SIZE: - return "CL_INVALID_ARG_SIZE"; - case CL_INVALID_KERNEL_ARGS: - return "CL_INVALID_KERNEL_ARGS"; - case CL_INVALID_WORK_DIMENSION: - return "CL_INVALID_WORK_DIMENSION"; - case CL_INVALID_WORK_GROUP_SIZE: - return "CL_INVALID_WORK_GROUP_SIZE"; - case CL_INVALID_WORK_ITEM_SIZE: - return "CL_INVALID_WORK_ITEM_SIZE"; - case CL_INVALID_GLOBAL_OFFSET: - return "CL_INVALID_GLOBAL_OFFSET"; - case CL_INVALID_EVENT_WAIT_LIST: - return "CL_INVALID_EVENT_WAIT_LIST"; - case CL_INVALID_EVENT: - return "CL_INVALID_EVENT"; - case CL_INVALID_OPERATION: - return "CL_INVALID_OPERATION"; - case CL_INVALID_GL_OBJECT: - return "CL_INVALID_GL_OBJECT"; - case CL_INVALID_BUFFER_SIZE: - return "CL_INVALID_BUFFER_SIZE"; - case CL_INVALID_MIP_LEVEL: - return "CL_INVALID_MIP_LEVEL"; - case CL_INVALID_GLOBAL_WORK_SIZE: - return "CL_INVALID_GLOBAL_WORK_SIZE"; - case CL_INVALID_PROPERTY: - return "CL_INVALID_PROPERTY"; -#ifdef CL_VERSION_1_2 - case CL_INVALID_IMAGE_DESCRIPTOR: - return "CL_INVALID_IMAGE_DESCRIPTOR"; - case CL_INVALID_COMPILER_OPTIONS: - return "CL_INVALID_COMPILER_OPTIONS"; - case CL_INVALID_LINKER_OPTIONS: - return "CL_INVALID_LINKER_OPTIONS"; - case CL_INVALID_DEVICE_PARTITION_COUNT: - return "CL_INVALID_DEVICE_PARTITION_COUNT"; -#endif -#if defined(CL_VERSION_2_0) && !defined(CONF_ENFORCE_OpenCL_1_2) - case CL_INVALID_PIPE_SIZE: - return "CL_INVALID_PIPE_SIZE"; - case CL_INVALID_DEVICE_QUEUE: - return "CL_INVALID_DEVICE_QUEUE"; -#endif - default: - return "UNKNOWN_ERROR"; - } -} - #if 0 void printer::inst()->print_msg(L1,const char* fmt, ...); void printer::inst()->print_str(const char* str); @@ -284,11 +148,37 @@ size_t 
InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ return ERR_OCL_API; } - /* Some kernel spawn 8 times more threads than the user is configuring. - * To give the user the correct maximum work size we divide the hardware specific max by 8. - */ - MaximumWorkSize /= 8; + auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); + bool useCryptonight_gpu = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_gpu) != neededAlgorithms.end(); + + if(useCryptonight_gpu) + { + // work cn_1 we use 16x more threads than configured by the user + MaximumWorkSize /= 16; + } + else + { + /* Some kernel spawn 8 times more threads than the user is configuring. + * To give the user the correct maximum work size we divide the hardware specific max by 8. + */ + MaximumWorkSize /= 8; + } printer::inst()->print_msg(L1,"Device %lu work size %lu / %lu.", ctx->deviceIdx, ctx->workSize, MaximumWorkSize); + + if(ctx->workSize > MaximumWorkSize) + { + ctx->workSize = MaximumWorkSize; + printer::inst()->print_msg(L1,"Device %lu work size to large, reduce to %lu / %lu.", ctx->deviceIdx, ctx->workSize, MaximumWorkSize); + } + + const std::string backendName = xmrstak::params::inst().openCLVendor; + if( (ctx->stridedIndex == 2 || ctx->stridedIndex == 3) && (ctx->rawIntensity % ctx->workSize) != 0) + { + size_t reduced_intensity = (ctx->rawIntensity / ctx->workSize) * ctx->workSize; + ctx->rawIntensity = reduced_intensity; + printer::inst()->print_msg(L0, "WARNING %s: gpu %d intensity is not a multiple of 'worksize', auto reduce intensity to %d", backendName.c_str(), ctx->deviceIdx, int(reduced_intensity)); + } + #if defined(CL_VERSION_2_0) && !defined(CONF_ENFORCE_OpenCL_1_2) const cl_queue_properties CommandQueueProperties[] = { 0, 0, 0 }; ctx->CommandQueues = clCreateCommandQueueWithProperties(opencl_ctx, ctx->DeviceID, CommandQueueProperties, &ret); @@ -316,10 +206,11 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, 
GpuContext* ctx, const char* source_ return ERR_OCL_API; } - size_t scratchPadSize = std::max( - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()), - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) - ); + size_t scratchPadSize = 0; + for(const auto algo : neededAlgorithms) + { + scratchPadSize = std::max(scratchPadSize, algo.Mem()); + } size_t g_thd = ctx->rawIntensity; ctx->ExtraBuffers[0] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, scratchPadSize * g_thd, NULL, &ret); @@ -390,18 +281,12 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ return ERR_OCL_API; } - xmrstak_algo miner_algo[2] = { - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo(), - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() - }; - int num_algos = miner_algo[0] == miner_algo[1] ? 1 : 2; - - for(int ii = 0; ii < num_algos; ++ii) + for(const auto miner_algo : neededAlgorithms) { // scratchpad size for the selected mining algorithm - size_t hashMemSize = cn_select_memory(miner_algo[ii]); - int threadMemMask = cn_select_mask(miner_algo[ii]); - int hashIterations = cn_select_iter(miner_algo[ii]); + size_t hashMemSize = miner_algo.Mem(); + int threadMemMask = miner_algo.Mask(); + int hashIterations = miner_algo.Iter(); size_t mem_chunk_exp = 1u << ctx->memChunk; size_t strided_index = ctx->stridedIndex; @@ -409,7 +294,20 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ * this is required if the dev pool is mining monero * but the user tuned there settings for another currency */ - if(miner_algo[ii] == cryptonight_monero_v8) + if(miner_algo == cryptonight_monero_v8) + { + if(ctx->memChunk < 2) + mem_chunk_exp = 1u << 2; + if(strided_index == 1) + strided_index = 0; + } + + if(miner_algo == cryptonight_gpu) + { + strided_index = 0; + } + + if(miner_algo == cryptonight_r || 
miner_algo == cryptonight_r_wow) { if(ctx->memChunk < 2) mem_chunk_exp = 1u << 2; @@ -428,7 +326,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ options += " -DMEM_CHUNK_EXPONENT=" + std::to_string(mem_chunk_exp) + "U"; options += " -DCOMP_MODE=" + std::to_string(needCompMode); options += " -DMEMORY=" + std::to_string(hashMemSize) + "LU"; - options += " -DALGO=" + std::to_string(miner_algo[ii]); + options += " -DALGO=" + std::to_string(miner_algo.Id()); options += " -DCN_UNROLL=" + std::to_string(ctx->unroll); /* AMD driver output is something like: `1445.5 (VM)` * and is mapped to `14` only. The value is only used for a compiler @@ -436,6 +334,9 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ */ options += " -DOPENCL_DRIVER_MAJOR=" + std::to_string(std::stoi(openCLDriverVer.data()) / 100); + if(miner_algo == cryptonight_gpu) + options += " -cl-fp32-correctly-rounded-divide-sqrt"; + /* create a hash for the compile time cache * used data: * - source code @@ -457,20 +358,20 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ { if(xmrstak::params::inst().AMDCache) printer::inst()->print_msg(L1,"OpenCL device %u - Precompiled code %s not found. 
Compiling ...",ctx->deviceIdx, cache_file.c_str()); - ctx->Program[ii] = clCreateProgramWithSource(opencl_ctx, 1, (const char**)&source_code, NULL, &ret); + ctx->Program[miner_algo] = clCreateProgramWithSource(opencl_ctx, 1, (const char**)&source_code, NULL, &ret); if(ret != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithSource on the OpenCL miner code", err_to_str(ret)); return ERR_OCL_API; } - ret = clBuildProgram(ctx->Program[ii], 1, &ctx->DeviceID, options.c_str(), NULL, NULL); + ret = clBuildProgram(ctx->Program[miner_algo], 1, &ctx->DeviceID, options.c_str(), NULL, NULL); if(ret != CL_SUCCESS) { size_t len; printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram.", err_to_str(ret)); - if((ret = clGetProgramBuildInfo(ctx->Program[ii], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS) + if((ret = clGetProgramBuildInfo(ctx->Program[miner_algo], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for length of build log output.", err_to_str(ret)); return ERR_OCL_API; @@ -479,7 +380,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ char* BuildLog = (char*)malloc(len + 1); BuildLog[0] = '\0'; - if((ret = clGetProgramBuildInfo(ctx->Program[ii], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS) + if((ret = clGetProgramBuildInfo(ctx->Program[miner_algo], ctx->DeviceID, CL_PROGRAM_BUILD_LOG, len, BuildLog, NULL)) != CL_SUCCESS) { free(BuildLog); printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for build log.", err_to_str(ret)); @@ -494,11 +395,11 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ } cl_uint num_devices; - clGetProgramInfo(ctx->Program[ii], CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices,NULL); + clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_NUM_DEVICES, 
sizeof(cl_uint), &num_devices,NULL); std::vector devices_ids(num_devices); - clGetProgramInfo(ctx->Program[ii], CL_PROGRAM_DEVICES, sizeof(cl_device_id)* devices_ids.size(), devices_ids.data(),NULL); + clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_DEVICES, sizeof(cl_device_id)* devices_ids.size(), devices_ids.data(),NULL); int dev_id = 0; /* Search for the gpu within the program context. * The id can be different to ctx->DeviceID. @@ -513,7 +414,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ cl_build_status status; do { - if((ret = clGetProgramBuildInfo(ctx->Program[ii], ctx->DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS) + if((ret = clGetProgramBuildInfo(ctx->Program[miner_algo], ctx->DeviceID, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clGetProgramBuildInfo for status of build.", err_to_str(ret)); return ERR_OCL_API; @@ -525,7 +426,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ if(xmrstak::params::inst().AMDCache) { std::vector binary_sizes(num_devices); - clGetProgramInfo (ctx->Program[ii], CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * binary_sizes.size(), binary_sizes.data(), NULL); + clGetProgramInfo (ctx->Program[miner_algo], CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * binary_sizes.size(), binary_sizes.data(), NULL); std::vector all_programs(num_devices); std::vector> program_storage; @@ -541,7 +442,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ p_id++; } - if((ret = clGetProgramInfo(ctx->Program[ii], CL_PROGRAM_BINARIES, num_devices * sizeof(char*), all_programs.data(),NULL)) != CL_SUCCESS) + if((ret = clGetProgramInfo(ctx->Program[miner_algo], CL_PROGRAM_BINARIES, num_devices * sizeof(char*), all_programs.data(),NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling 
clGetProgramInfo.", err_to_str(ret)); return ERR_OCL_API; @@ -565,7 +466,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ auto data_ptr = s.data(); cl_int clStatus; - ctx->Program[ii] = clCreateProgramWithBinary( + ctx->Program[miner_algo] = clCreateProgramWithBinary( opencl_ctx, 1, &ctx->DeviceID, &bin_size, (const unsigned char **)&data_ptr, &clStatus, &ret ); @@ -574,7 +475,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ printer::inst()->print_msg(L1,"Error %s when calling clCreateProgramWithBinary. Try to delete file %s", err_to_str(ret), cache_file.c_str()); return ERR_OCL_API; } - ret = clBuildProgram(ctx->Program[ii], 1, &ctx->DeviceID, NULL, NULL, NULL); + ret = clBuildProgram(ctx->Program[miner_algo], 1, &ctx->DeviceID, NULL, NULL, NULL); if(ret != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clBuildProgram. Try to delete file %s", err_to_str(ret), cache_file.c_str()); @@ -582,40 +483,35 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ } } - std::vector KernelNames = { "cn0", "cn1", "cn2", "Blake", "Groestl", "JH", "Skein" }; + std::vector KernelNames = { "cn2", "Blake", "Groestl", "JH", "Skein" }; + if(miner_algo == cryptonight_gpu) + { + KernelNames.insert(KernelNames.begin(), "cn1_cn_gpu"); + KernelNames.insert(KernelNames.begin(), "cn0_cn_gpu"); + } + else + { + KernelNames.insert(KernelNames.begin(), "cn1"); + KernelNames.insert(KernelNames.begin(), "cn0"); + } + // append algorithm number to kernel name for(int k = 0; k < 3; k++) - KernelNames[k] += std::to_string(miner_algo[ii]); + KernelNames[k] += std::to_string(miner_algo); - if(ii == 0) + if(miner_algo == cryptonight_gpu) { - for(int i = 0; i < 7; ++i) - { - ctx->Kernels[ii][i] = clCreateKernel(ctx->Program[ii], KernelNames[i].c_str(), &ret); - if(ret != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel_0 %s.", 
err_to_str(ret), KernelNames[i].c_str()); - return ERR_OCL_API; - } - } + KernelNames.push_back(std::string("cn00_cn_gpu") + std::to_string(miner_algo)); } - else + + for(int i = 0; i < KernelNames.size(); ++i) { - for(int i = 0; i < 3; ++i) - { - ctx->Kernels[ii][i] = clCreateKernel(ctx->Program[ii], KernelNames[i].c_str(), &ret); - if(ret != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel_1 %s.", err_to_str(ret), KernelNames[i].c_str()); - return ERR_OCL_API; - } - } - // move kernel from the main algorithm into the root algorithm kernel space - for(int i = 3; i < 7; ++i) + ctx->Kernels[miner_algo][i] = clCreateKernel(ctx->Program[miner_algo], KernelNames[i].c_str(), &ret); + if(ret != CL_SUCCESS) { - ctx->Kernels[ii][i] = ctx->Kernels[0][i]; + printer::inst()->print_msg(L1,"Error %s when calling clCreateKernel for kernel_0 %s.", err_to_str(ret), KernelNames[i].c_str()); + return ERR_OCL_API; } - } } ctx->Nonce = 0; @@ -830,8 +726,6 @@ int getAMDPlatformIdx() // Returns 0 on success, -1 on stupid params, -2 on OpenCL API error size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) { - - cl_context opencl_ctx; cl_int ret; cl_uint entries; @@ -910,15 +804,6 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) TempDeviceList[i] = DeviceIDList[ctx[i].deviceIdx]; } - opencl_ctx = clCreateContext(NULL, num_gpus, TempDeviceList, NULL, NULL, &ret); - if(ret != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clCreateContext.", err_to_str(ret)); - return ERR_OCL_API; - } - - //char* source_code = LoadTextFile(sSourcePath); - const char *fastIntMathV2CL = #include "./opencl/fast_int_math_v2.cl" ; @@ -943,6 +828,9 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) const char *wolfSkeinCL = #include "./opencl/wolf-skein.cl" ; + const char *cryptonight_gpu = + #include "./opencl/cryptonight_gpu.cl" + ; std::string source_code(cryptonightCL); 
source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL); @@ -952,12 +840,27 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_JH"), jhCL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_BLAKE256"), blake256CL); source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_GROESTL256"), groestl256CL); + source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_CN_GPU"), cryptonight_gpu); // create a directory for the OpenCL compile cache create_directory(get_home() + "/.openclcache"); std::vector> interleaveData(num_gpus, nullptr); + std::vector context_vec(entries, nullptr); + for(int i = 0; i < num_gpus; ++i) + { + if(context_vec[ctx[i].deviceIdx] == nullptr) + { + context_vec[ctx[i].deviceIdx] = clCreateContext(NULL, 1, &(ctx[i].DeviceID), NULL, NULL, &ret); + if(ret != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clCreateContext.", err_to_str(ret)); + return ERR_OCL_API; + } + } + } + for(int i = 0; i < num_gpus; ++i) { const size_t devIdx = ctx[i].deviceIdx; @@ -976,16 +879,9 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) ctx[i].interleaveData = interleaveData[devIdx]; ctx[i].interleaveData->adjustThreshold = static_cast(ctx[i].interleave)/100.0; ctx[i].interleaveData->startAdjustThreshold = ctx[i].interleaveData->adjustThreshold; + ctx[i].opencl_ctx = context_vec[ctx[i].deviceIdx]; - const std::string backendName = xmrstak::params::inst().openCLVendor; - if( (ctx[i].stridedIndex == 2 || ctx[i].stridedIndex == 3) && (ctx[i].rawIntensity % ctx[i].workSize) != 0) - { - size_t reduced_intensity = (ctx[i].rawIntensity / ctx[i].workSize) * ctx[i].workSize; - ctx[i].rawIntensity = reduced_intensity; - printer::inst()->print_msg(L0, "WARNING %s: gpu %d intensity is not a multiple of 'worksize', auto reduce 
intensity to %d", backendName.c_str(), ctx[i].deviceIdx, int(reduced_intensity)); - } - - if((ret = InitOpenCLGpu(opencl_ctx, &ctx[i], source_code.c_str())) != ERR_SUCCESS) + if((ret = InitOpenCLGpu(ctx->opencl_ctx, &ctx[i], source_code.c_str())) != ERR_SUCCESS) { return ret; } @@ -994,10 +890,10 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) return ERR_SUCCESS; } -size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo) +size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, const xmrstak_algo& miner_algo, uint64_t height) { - // switch to the kernel storage - int kernel_storage = miner_algo == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ? 0 : 1; + + auto & Kernels = ctx->Kernels[miner_algo.Id()]; cl_int ret; @@ -1015,51 +911,103 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar return ERR_OCL_API; } - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 0, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[0], 0, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 0.", err_to_str(ret)); return ERR_OCL_API; } // Scratchpads - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 1, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[0], 1, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret)); return ERR_OCL_API; } // States - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg 
for kernel 0, argument 2.", err_to_str(ret)); return ERR_OCL_API; } // Threads - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][0], 3, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[0], 3, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 3.", err_to_str(ret)); return(ERR_OCL_API); } - // CN1 Kernel + if(miner_algo == cryptonight_gpu) + { + // we use an additional cn0 kernel to prepare the scratchpad + // Scratchpads + if((ret = clSetKernelArg(Kernels[7], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret)); + return ERR_OCL_API; + } + + // States + if((ret = clSetKernelArg(Kernels[7], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret)); + return ERR_OCL_API; + } + } + + // CN1 Kernel + + if ((miner_algo == cryptonight_r) || (miner_algo == cryptonight_r_wow)) { + + // Get new kernel + cl_program program = xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height); + + if (program != ctx->ProgramCryptonightR) { + cl_int ret; + cl_kernel kernel = clCreateKernel(program, "cn1_cryptonight_r", &ret); + + cl_kernel old_kernel = nullptr; + if (ret != CL_SUCCESS) { + printer::inst()->print_msg(LDEBUG, "CryptonightR: clCreateKernel returned error %s", err_to_str(ret)); + } + else { + old_kernel = Kernels[1]; + Kernels[1] = kernel; + } + ctx->ProgramCryptonightR = program; + + uint32_t PRECOMPILATION_DEPTH = 4; + + // Precompile next program in background + xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height + 1, true, old_kernel); + for (int i = 2; i <= PRECOMPILATION_DEPTH; ++i) + xmrstak::amd::CryptonightR_get_program(ctx, miner_algo, height + i, true, nullptr); + + 
printer::inst()->print_msg(LDEBUG, "Thread #%zu updated CryptonightR", ctx->deviceIdx); + } + else + { + printer::inst()->print_msg(LDEBUG, "Thread #%zu found CryptonightR", ctx->deviceIdx); + } + } // Scratchpads - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[1], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 0.", err_to_str(ret)); return ERR_OCL_API; } // States - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[1], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 1.", err_to_str(ret)); return ERR_OCL_API; } // Threads - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 2, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[1], 2, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 2.", err_to_str(ret)); return(ERR_OCL_API); @@ -1068,7 +1016,7 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar if(miner_algo == cryptonight_monero || miner_algo == cryptonight_aeon || miner_algo == cryptonight_ipbc || miner_algo == cryptonight_stellite || miner_algo == cryptonight_masari || miner_algo == cryptonight_bittube2) { // Input - if ((ret = clSetKernelArg(ctx->Kernels[kernel_storage][1], 3, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) + if ((ret = clSetKernelArg(Kernels[1], 3, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) { printer::inst()->print_msg(L1, "Error %s when calling clSetKernelArg for kernel 1, argument 4(input buffer).", err_to_str(ret)); return ERR_OCL_API; @@ -1077,89 +1025,115 @@ size_t 
XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar // CN3 Kernel // Scratchpads - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 0.", err_to_str(ret)); return ERR_OCL_API; } // States - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + if((ret = clSetKernelArg(Kernels[2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 1.", err_to_str(ret)); return ERR_OCL_API; } - // Branch 0 - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret)); - return ERR_OCL_API; - } - - // Branch 1 - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS) + if(miner_algo == cryptonight_gpu) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret)); - return ERR_OCL_API; - } - - // Branch 2 - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret)); - return ERR_OCL_API; - } + // Output + if((ret = clSetKernelArg(Kernels[2], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), 2, 2); + return ERR_OCL_API; + } - // Branch 3 - if((ret = 
clSetKernelArg(ctx->Kernels[kernel_storage][2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret)); - return ERR_OCL_API; - } + // Target + if((ret = clSetKernelArg(Kernels[2], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), 2, 3); + return ERR_OCL_API; + } - // Threads - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][2], 6, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret)); - return(ERR_OCL_API); + // Threads + if((ret = clSetKernelArg(Kernels[2], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret)); + return(ERR_OCL_API); + } } - - for(int i = 0; i < 4; ++i) - { - // States - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + else { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 0); + // Branch 0 + if((ret = clSetKernelArg(Kernels[2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret)); return ERR_OCL_API; } - // Nonce buffer - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 1, sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS) + // Branch 1 + if((ret = clSetKernelArg(Kernels[2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 1); + 
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret)); return ERR_OCL_API; } - // Output - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS) + // Branch 2 + if((ret = clSetKernelArg(Kernels[2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 2); + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret)); return ERR_OCL_API; } - // Target - if((ret = clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS) + // Branch 3 + if((ret = clSetKernelArg(Kernels[2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3); + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret)); return ERR_OCL_API; } - if((clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) + // Threads + if((ret = clSetKernelArg(Kernels[2], 6, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4); + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret)); return(ERR_OCL_API); } + + for(int i = 0; i < 4; ++i) + { + // States + if((ret = clSetKernelArg(Kernels[i + 3], 0, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 0); + return ERR_OCL_API; + } + + // Nonce buffer + if((ret = clSetKernelArg(Kernels[i + 3], 1, 
sizeof(cl_mem), ctx->ExtraBuffers + (i + 2))) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 1); + return ERR_OCL_API; + } + + // Output + if((ret = clSetKernelArg(Kernels[i + 3], 2, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 2); + return ERR_OCL_API; + } + + // Target + if((ret = clSetKernelArg(Kernels[i + 3], 3, sizeof(cl_ulong), &target)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3); + return ERR_OCL_API; + } + + if((clSetKernelArg(Kernels[i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4); + return(ERR_OCL_API); + } + } } return ERR_SUCCESS; @@ -1256,10 +1230,9 @@ uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment) return t0; } -size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) +size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner_algo) { - // switch to the kernel storage - int kernel_storage = miner_algo == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ? 
0 : 1; + const auto & Kernels = ctx->Kernels[miner_algo.Id()]; cl_int ret; cl_uint zero = 0; @@ -1294,7 +1267,7 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) } size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = { g_thd, 8 }, lthreads[2] = { 8, 8 }; - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 0); return ERR_OCL_API; @@ -1302,25 +1275,50 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) size_t tmpNonce = ctx->Nonce; - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + if(miner_algo == cryptonight_gpu) { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1); - return ERR_OCL_API; + size_t thd = 64; + size_t intens = g_intensity * thd; + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[7], 1, 0, &intens, &thd, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 7); + return ERR_OCL_API; + } + + size_t w_size_cn_gpu = w_size * 16; + size_t g_thd_cn_gpu = g_thd * 16; + + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[1], 1, 0, &g_thd_cn_gpu, &w_size_cn_gpu, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1); + return ERR_OCL_API; + } + } + else + { + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + { + 
printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1); + return ERR_OCL_API; + } } - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][2], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[2], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 2); return ERR_OCL_API; } - for(int i = 0; i < 4; ++i) + if(miner_algo != cryptonight_gpu) { - size_t tmpNonce = ctx->Nonce; - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][i + 3], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + for(int i = 0; i < 4; ++i) { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3); - return ERR_OCL_API; + size_t tmpNonce = ctx->Nonce; + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, Kernels[i + 3], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3); + return ERR_OCL_API; + } } } diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp index 80fcbefde..ae2b506db 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.hpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp @@ -14,6 +14,8 @@ #include #include #include +#include +#include #define ERR_SUCCESS (0) #define ERR_OCL_API (2) @@ -50,8 +52,10 @@ struct GpuContext cl_mem InputBuffer; cl_mem OutputBuffer; cl_mem ExtraBuffers[6]; - cl_program Program[2]; - cl_kernel Kernels[2][8]; + cl_context opencl_ctx = nullptr; + std::map Program; + std::map> Kernels; + cl_program ProgramCryptonightR = nullptr; size_t freeMem; size_t maxMemPerAlloc; int computeUnits; @@ -65,12 +69,152 @@ struct GpuContext }; 
+namespace +{ + const char* err_to_str(cl_int ret) + { + switch(ret) + { + case CL_SUCCESS: + return "CL_SUCCESS"; + case CL_DEVICE_NOT_FOUND: + return "CL_DEVICE_NOT_FOUND"; + case CL_DEVICE_NOT_AVAILABLE: + return "CL_DEVICE_NOT_AVAILABLE"; + case CL_COMPILER_NOT_AVAILABLE: + return "CL_COMPILER_NOT_AVAILABLE"; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: + return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + case CL_OUT_OF_RESOURCES: + return "CL_OUT_OF_RESOURCES"; + case CL_OUT_OF_HOST_MEMORY: + return "CL_OUT_OF_HOST_MEMORY"; + case CL_PROFILING_INFO_NOT_AVAILABLE: + return "CL_PROFILING_INFO_NOT_AVAILABLE"; + case CL_MEM_COPY_OVERLAP: + return "CL_MEM_COPY_OVERLAP"; + case CL_IMAGE_FORMAT_MISMATCH: + return "CL_IMAGE_FORMAT_MISMATCH"; + case CL_IMAGE_FORMAT_NOT_SUPPORTED: + return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; + case CL_BUILD_PROGRAM_FAILURE: + return "CL_BUILD_PROGRAM_FAILURE"; + case CL_MAP_FAILURE: + return "CL_MAP_FAILURE"; + case CL_MISALIGNED_SUB_BUFFER_OFFSET: + return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; + case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: + return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; + #ifdef CL_VERSION_1_2 + case CL_COMPILE_PROGRAM_FAILURE: + return "CL_COMPILE_PROGRAM_FAILURE"; + case CL_LINKER_NOT_AVAILABLE: + return "CL_LINKER_NOT_AVAILABLE"; + case CL_LINK_PROGRAM_FAILURE: + return "CL_LINK_PROGRAM_FAILURE"; + case CL_DEVICE_PARTITION_FAILED: + return "CL_DEVICE_PARTITION_FAILED"; + case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: + return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; + #endif + case CL_INVALID_VALUE: + return "CL_INVALID_VALUE"; + case CL_INVALID_DEVICE_TYPE: + return "CL_INVALID_DEVICE_TYPE"; + case CL_INVALID_PLATFORM: + return "CL_INVALID_PLATFORM"; + case CL_INVALID_DEVICE: + return "CL_INVALID_DEVICE"; + case CL_INVALID_CONTEXT: + return "CL_INVALID_CONTEXT"; + case CL_INVALID_QUEUE_PROPERTIES: + return "CL_INVALID_QUEUE_PROPERTIES"; + case CL_INVALID_COMMAND_QUEUE: + return "CL_INVALID_COMMAND_QUEUE"; + case 
CL_INVALID_HOST_PTR: + return "CL_INVALID_HOST_PTR"; + case CL_INVALID_MEM_OBJECT: + return "CL_INVALID_MEM_OBJECT"; + case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: + return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; + case CL_INVALID_IMAGE_SIZE: + return "CL_INVALID_IMAGE_SIZE"; + case CL_INVALID_SAMPLER: + return "CL_INVALID_SAMPLER"; + case CL_INVALID_BINARY: + return "CL_INVALID_BINARY"; + case CL_INVALID_BUILD_OPTIONS: + return "CL_INVALID_BUILD_OPTIONS"; + case CL_INVALID_PROGRAM: + return "CL_INVALID_PROGRAM"; + case CL_INVALID_PROGRAM_EXECUTABLE: + return "CL_INVALID_PROGRAM_EXECUTABLE"; + case CL_INVALID_KERNEL_NAME: + return "CL_INVALID_KERNEL_NAME"; + case CL_INVALID_KERNEL_DEFINITION: + return "CL_INVALID_KERNEL_DEFINITION"; + case CL_INVALID_KERNEL: + return "CL_INVALID_KERNEL"; + case CL_INVALID_ARG_INDEX: + return "CL_INVALID_ARG_INDEX"; + case CL_INVALID_ARG_VALUE: + return "CL_INVALID_ARG_VALUE"; + case CL_INVALID_ARG_SIZE: + return "CL_INVALID_ARG_SIZE"; + case CL_INVALID_KERNEL_ARGS: + return "CL_INVALID_KERNEL_ARGS"; + case CL_INVALID_WORK_DIMENSION: + return "CL_INVALID_WORK_DIMENSION"; + case CL_INVALID_WORK_GROUP_SIZE: + return "CL_INVALID_WORK_GROUP_SIZE"; + case CL_INVALID_WORK_ITEM_SIZE: + return "CL_INVALID_WORK_ITEM_SIZE"; + case CL_INVALID_GLOBAL_OFFSET: + return "CL_INVALID_GLOBAL_OFFSET"; + case CL_INVALID_EVENT_WAIT_LIST: + return "CL_INVALID_EVENT_WAIT_LIST"; + case CL_INVALID_EVENT: + return "CL_INVALID_EVENT"; + case CL_INVALID_OPERATION: + return "CL_INVALID_OPERATION"; + case CL_INVALID_GL_OBJECT: + return "CL_INVALID_GL_OBJECT"; + case CL_INVALID_BUFFER_SIZE: + return "CL_INVALID_BUFFER_SIZE"; + case CL_INVALID_MIP_LEVEL: + return "CL_INVALID_MIP_LEVEL"; + case CL_INVALID_GLOBAL_WORK_SIZE: + return "CL_INVALID_GLOBAL_WORK_SIZE"; + case CL_INVALID_PROPERTY: + return "CL_INVALID_PROPERTY"; + #ifdef CL_VERSION_1_2 + case CL_INVALID_IMAGE_DESCRIPTOR: + return "CL_INVALID_IMAGE_DESCRIPTOR"; + case CL_INVALID_COMPILER_OPTIONS: + return 
"CL_INVALID_COMPILER_OPTIONS"; + case CL_INVALID_LINKER_OPTIONS: + return "CL_INVALID_LINKER_OPTIONS"; + case CL_INVALID_DEVICE_PARTITION_COUNT: + return "CL_INVALID_DEVICE_PARTITION_COUNT"; + #endif + #if defined(CL_VERSION_2_0) && !defined(CONF_ENFORCE_OpenCL_1_2) + case CL_INVALID_PIPE_SIZE: + return "CL_INVALID_PIPE_SIZE"; + case CL_INVALID_DEVICE_QUEUE: + return "CL_INVALID_DEVICE_QUEUE"; + #endif + default: + return "UNKNOWN_ERROR"; + } + } +} + uint32_t getNumPlatforms(); int getAMDPlatformIdx(); std::vector getAMDDevices(int index); size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx); -size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo); -size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo); +size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, const xmrstak_algo& miner_algo, uint64_t height); +size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, const xmrstak_algo& miner_algo); uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment = true); uint64_t updateTimings(GpuContext* ctx, const uint64_t t); diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 6a3def72c..2ca09c31c 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -14,6 +14,23 @@ R"===( * along with this program. If not, see . 
*/ +// defines to translate algorithm names int a same number used within cryptonight.h +#define invalid_algo 0 +#define cryptonight 1 +#define cryptonight_lite 2 +#define cryptonight_monero 3 +#define cryptonight_heavy 4 +#define cryptonight_aeon 5 +#define cryptonight_ipbc 6 +#define cryptonight_stellite 7 +#define cryptonight_masari 8 +#define cryptonight_haven 9 +#define cryptonight_bittube2 10 +#define cryptonight_monero_v8 11 +#define cryptonight_superfast 12 +#define cryptonight_gpu 13 +#define cryptonight_conceal 14 + /* For Mesa clover support */ #ifdef cl_clang_storage_class_specifiers # pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable @@ -34,10 +51,10 @@ R"===( */ inline uint2 amd_bitalign( const uint2 src0, const uint2 src1, const uint src2) { - uint2 result; + uint2 result; result.s0 = (uint) (((((ulong)src0.s0) << 32) | (ulong)src1.s0) >> (src2)); result.s1 = (uint) (((((ulong)src0.s1) << 32) | (ulong)src1.s1) >> (src2)); - return result; + return result; } #endif @@ -61,23 +78,278 @@ inline uint2 amd_bitalign( const uint2 src0, const uint2 src1, const uint src2) */ inline int amd_bfe(const uint src0, const uint offset, const uint width) { - /* casts are removed because we can implement everything as uint - * int offset = src1; - * int width = src2; - * remove check for edge case, this function is always called with - * `width==8` - * @code - * if ( width == 0 ) - * return 0; - * @endcode - */ - if ( (offset + width) < 32u ) - return (src0 << (32u - offset - width)) >> (32u - width); - - return src0 >> offset; + /* casts are removed because we can implement everything as uint + * int offset = src1; + * int width = src2; + * remove check for edge case, this function is always called with + * `width==8` + * @code + * if ( width == 0 ) + * return 0; + * @endcode + */ + if ( (offset + width) < 32u ) + return (src0 << (32u - offset - width)) >> (32u - width); + + return src0 >> offset; +} +#endif + +static const __constant ulong 
keccakf_rndc[24] = +{ + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 +}; + +static const __constant uchar sbox[256] = +{ + 0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76, + 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, + 0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15, + 0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75, + 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84, + 0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF, + 0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8, + 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, + 0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73, + 0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB, + 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, + 0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08, + 0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A, + 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, + 0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, 0x9B, 
0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF, + 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16 +}; + + +void keccakf1600(ulong *s) +{ + for(int i = 0; i < 24; ++i) + { + ulong bc[5], tmp1, tmp2; + bc[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20] ^ rotate(s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22], 1UL); + bc[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21] ^ rotate(s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23], 1UL); + bc[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22] ^ rotate(s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24], 1UL); + bc[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23] ^ rotate(s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20], 1UL); + bc[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24] ^ rotate(s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21], 1UL); + + tmp1 = s[1] ^ bc[0]; + + s[0] ^= bc[4]; + s[1] = rotate(s[6] ^ bc[0], 44UL); + s[6] = rotate(s[9] ^ bc[3], 20UL); + s[9] = rotate(s[22] ^ bc[1], 61UL); + s[22] = rotate(s[14] ^ bc[3], 39UL); + s[14] = rotate(s[20] ^ bc[4], 18UL); + s[20] = rotate(s[2] ^ bc[1], 62UL); + s[2] = rotate(s[12] ^ bc[1], 43UL); + s[12] = rotate(s[13] ^ bc[2], 25UL); + s[13] = rotate(s[19] ^ bc[3], 8UL); + s[19] = rotate(s[23] ^ bc[2], 56UL); + s[23] = rotate(s[15] ^ bc[4], 41UL); + s[15] = rotate(s[4] ^ bc[3], 27UL); + s[4] = rotate(s[24] ^ bc[3], 14UL); + s[24] = rotate(s[21] ^ bc[0], 2UL); + s[21] = rotate(s[8] ^ bc[2], 55UL); + s[8] = rotate(s[16] ^ bc[0], 35UL); + s[16] = rotate(s[5] ^ bc[4], 36UL); + s[5] = rotate(s[3] ^ bc[2], 28UL); + s[3] = rotate(s[18] ^ bc[2], 21UL); + s[18] = rotate(s[17] ^ bc[1], 15UL); + s[17] = rotate(s[11] ^ bc[0], 10UL); + s[11] = rotate(s[7] ^ bc[1], 6UL); + s[7] = rotate(s[10] ^ bc[4], 3UL); + s[10] = rotate(tmp1, 1UL); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] 
= bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0] ^= keccakf_rndc[i]; + } +} + +static const __constant uint keccakf_rotc[24] = +{ + 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, + 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44 +}; + +static const __constant uint keccakf_piln[24] = +{ + 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, + 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1 +}; + +inline void keccakf1600_1(ulong st[25]) +{ + int i, round; + ulong t, bc[5]; + + #pragma unroll 1 + for (round = 0; round < 24; ++round) + { + bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20] ^ rotate(st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22], 1UL); + bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21] ^ rotate(st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23], 1UL); + bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22] ^ rotate(st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24], 1UL); + bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23] ^ rotate(st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20], 1UL); + bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24] ^ rotate(st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21], 1UL); + + st[0] ^= 
bc[4]; + st[5] ^= bc[4]; + st[10] ^= bc[4]; + st[15] ^= bc[4]; + st[20] ^= bc[4]; + + st[1] ^= bc[0]; + st[6] ^= bc[0]; + st[11] ^= bc[0]; + st[16] ^= bc[0]; + st[21] ^= bc[0]; + + st[2] ^= bc[1]; + st[7] ^= bc[1]; + st[12] ^= bc[1]; + st[17] ^= bc[1]; + st[22] ^= bc[1]; + + st[3] ^= bc[2]; + st[8] ^= bc[2]; + st[13] ^= bc[2]; + st[18] ^= bc[2]; + st[23] ^= bc[2]; + + st[4] ^= bc[3]; + st[9] ^= bc[3]; + st[14] ^= bc[3]; + st[19] ^= bc[3]; + st[24] ^= bc[3]; + + // Rho Pi + t = st[1]; + #pragma unroll + for (i = 0; i < 24; ++i) { + bc[0] = st[keccakf_piln[i]]; + st[keccakf_piln[i]] = rotate(t, (ulong)keccakf_rotc[i]); + t = bc[0]; + } + + #pragma unroll + for(int i = 0; i < 25; i += 5) + { + ulong tmp1 = st[i], tmp2 = st[i + 1]; + + st[i] = bitselect(st[i] ^ st[i + 2], st[i], st[i + 1]); + st[i + 1] = bitselect(st[i + 1] ^ st[i + 3], st[i + 1], st[i + 2]); + st[i + 2] = bitselect(st[i + 2] ^ st[i + 4], st[i + 2], st[i + 3]); + st[i + 3] = bitselect(st[i + 3] ^ tmp1, st[i + 3], st[i + 4]); + st[i + 4] = bitselect(st[i + 4] ^ tmp2, st[i + 4], tmp1); + } + + // Iota + st[0] ^= keccakf_rndc[round]; + } } +)===" +R"===( + +void keccakf1600_2(__local ulong *st) +{ + int i, round; + ulong t, bc[5]; + + #pragma unroll 1 + for (round = 0; round < 24; ++round) + { + bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20] ^ rotate(st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22], 1UL); + bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21] ^ rotate(st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23], 1UL); + bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22] ^ rotate(st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24], 1UL); + bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23] ^ rotate(st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20], 1UL); + bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24] ^ rotate(st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21], 1UL); + + st[0] ^= bc[4]; + st[5] ^= bc[4]; + st[10] ^= bc[4]; + st[15] ^= bc[4]; + st[20] ^= bc[4]; + + st[1] ^= bc[0]; + st[6] ^= bc[0]; + st[11] ^= bc[0]; + st[16] ^= bc[0]; + st[21] ^= 
bc[0]; + + st[2] ^= bc[1]; + st[7] ^= bc[1]; + st[12] ^= bc[1]; + st[17] ^= bc[1]; + st[22] ^= bc[1]; + + st[3] ^= bc[2]; + st[8] ^= bc[2]; + st[13] ^= bc[2]; + st[18] ^= bc[2]; + st[23] ^= bc[2]; + + st[4] ^= bc[3]; + st[9] ^= bc[3]; + st[14] ^= bc[3]; + st[19] ^= bc[3]; + st[24] ^= bc[3]; + + // Rho Pi + t = st[1]; + #pragma unroll + for (i = 0; i < 24; ++i) { + bc[0] = st[keccakf_piln[i]]; + st[keccakf_piln[i]] = rotate(t, (ulong)keccakf_rotc[i]); + t = bc[0]; + } + + #pragma unroll + for(int i = 0; i < 25; i += 5) + { + ulong tmp1 = st[i], tmp2 = st[i + 1]; + + st[i] = bitselect(st[i] ^ st[i + 2], st[i], st[i + 1]); + st[i + 1] = bitselect(st[i + 1] ^ st[i + 3], st[i + 1], st[i + 2]); + st[i + 2] = bitselect(st[i + 2] ^ st[i + 4], st[i + 2], st[i + 3]); + st[i + 3] = bitselect(st[i + 3] ^ tmp1, st[i + 3], st[i + 4]); + st[i + 4] = bitselect(st[i + 4] ^ tmp2, st[i + 4], tmp1); + } + + // Iota + st[0] ^= keccakf_rndc[round]; + } +} + +#define MEM_CHUNK (1<> right) | ( a.y << left ), + ((uint)a.y >> right) | ( a.z << left ), + ((uint)a.z >> right) | ( a.w << left ), + ((uint)a.w >> right) | ( a.x << left ) + ); } +#if (ALGO == cryptonight_gpu) + //#include "opencl/cryptonight_gpu.cl" + XMRSTAK_INCLUDE_CN_GPU +#endif + )===" R"===( void CNKeccak(ulong *output, ulong *input) { - ulong st[25]; + ulong st[25]; - // Copy 72 bytes - for(int i = 0; i < 9; ++i) st[i] = input[i]; + // Copy 72 bytes + for(int i = 0; i < 9; ++i) st[i] = input[i]; - // Last four and '1' bit for padding - //st[9] = as_ulong((uint2)(((uint *)input)[18], 0x00000001U)); + // Last four and '1' bit for padding + //st[9] = as_ulong((uint2)(((uint *)input)[18], 0x00000001U)); - st[9] = (input[9] & 0x00000000FFFFFFFFUL) | 0x0000000100000000UL; + st[9] = (input[9] & 0x00000000FFFFFFFFUL) | 0x0000000100000000UL; - for(int i = 10; i < 25; ++i) st[i] = 0x00UL; + for(int i = 10; i < 25; ++i) st[i] = 0x00UL; - // Last bit of padding - st[16] = 0x8000000000000000UL; + // Last bit of padding + st[16] = 
0x8000000000000000UL; - keccakf1600_1(st); + keccakf1600_1(st); - for(int i = 0; i < 25; ++i) output[i] = st[i]; + for(int i = 0; i < 25; ++i) output[i] = st[i]; } static const __constant uchar rcon[8] = { 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40 }; @@ -344,201 +466,180 @@ static const __constant uchar rcon[8] = { 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x void AESExpandKey256(uint *keybuf) { - //#pragma unroll 4 - for(uint c = 8, i = 1; c < 40; ++c) - { - // For 256-bit keys, an sbox permutation is done every other 4th uint generated, AND every 8th - uint t = ((!(c & 7)) || ((c & 7) == 4)) ? SubWord(keybuf[c - 1]) : keybuf[c - 1]; - - // If the uint we're generating has an index that is a multiple of 8, rotate and XOR with the round constant, - // then XOR this with previously generated uint. If it's 4 after a multiple of 8, only the sbox permutation - // is done, followed by the XOR. If neither are true, only the XOR with the previously generated uint is done. - keybuf[c] = keybuf[c - 8] ^ ((!(c & 7)) ? rotate(t, 24U) ^ as_uint((uchar4)(rcon[i++], 0U, 0U, 0U)) : t); - } + //#pragma unroll 4 + for(uint c = 8, i = 1; c < 40; ++c) + { + // For 256-bit keys, an sbox permutation is done every other 4th uint generated, AND every 8th + uint t = ((!(c & 7)) || ((c & 7) == 4)) ? SubWord(keybuf[c - 1]) : keybuf[c - 1]; + + // If the uint we're generating has an index that is a multiple of 8, rotate and XOR with the round constant, + // then XOR this with previously generated uint. If it's 4 after a multiple of 8, only the sbox permutation + // is done, followed by the XOR. If neither are true, only the XOR with the previously generated uint is done. + keybuf[c] = keybuf[c - 8] ^ ((!(c & 7)) ? 
rotate(t, 24U) ^ as_uint((uchar4)(rcon[i++], 0U, 0U, 0U)) : t); + } } )===" R"===( -#define MEM_CHUNK (1<> 4); + Scratchpad += gIdx * (MEMORY >> 4); #elif(STRIDED_INDEX==1) Scratchpad += gIdx; #elif(STRIDED_INDEX==2) - Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * (gIdx % WORKSIZE); + Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * (gIdx % WORKSIZE); #elif(STRIDED_INDEX==3) Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + (gIdx % WORKSIZE); #endif - if (get_local_id(1) == 0) - { - __local ulong* State = State_buf + get_local_id(0) * 25; + if (get_local_id(1) == 0) + { + __local ulong* State = State_buf + get_local_id(0) * 25; // NVIDIA #ifdef __NV_CL_C_VERSION for(uint i = 0; i < 8; ++i) State[i] = input[i]; #else - ((__local ulong8 *)State)[0] = vload8(0, input); + ((__local ulong8 *)State)[0] = vload8(0, input); #endif - State[8] = input[8]; - State[9] = input[9]; - State[10] = input[10]; - - ((__local uint *)State)[9] &= 0x00FFFFFFU; - ((__local uint *)State)[9] |= (((uint)get_global_id(0)) & 0xFF) << 24; - ((__local uint *)State)[10] &= 0xFF000000U; - /* explicit cast to `uint` is required because some OpenCL implementations (e.g. NVIDIA) - * handle get_global_id and get_global_offset as signed long long int and add - * 0xFFFFFFFF... 
to `get_global_id` if we set on host side a 32bit offset where the first bit is `1` - * (even if it is correct casted to unsigned on the host) - */ - ((__local uint *)State)[10] |= (((uint)get_global_id(0) >> 8)); - - for (int i = 11; i < 25; ++i) { - State[i] = 0x00UL; - } - - // Last bit of padding - State[16] = 0x8000000000000000UL; - - keccakf1600_2(State); - - #pragma unroll - for (int i = 0; i < 25; ++i) { - states[i] = State[i]; - } - } - } - - barrier(CLK_GLOBAL_MEM_FENCE); + State[8] = input[8]; + State[9] = input[9]; + State[10] = input[10]; + + ((__local uint *)State)[9] &= 0x00FFFFFFU; + ((__local uint *)State)[9] |= (((uint)get_global_id(0)) & 0xFF) << 24; + ((__local uint *)State)[10] &= 0xFF000000U; + /* explicit cast to `uint` is required because some OpenCL implementations (e.g. NVIDIA) + * handle get_global_id and get_global_offset as signed long long int and add + * 0xFFFFFFFF... to `get_global_id` if we set on host side a 32bit offset where the first bit is `1` + * (even if it is correct casted to unsigned on the host) + */ + ((__local uint *)State)[10] |= (((uint)get_global_id(0) >> 8)); + + for (int i = 11; i < 25; ++i) { + State[i] = 0x00UL; + } + + // Last bit of padding + State[16] = 0x8000000000000000UL; + + keccakf1600_2(State); + + #pragma unroll + for (int i = 0; i < 25; ++i) { + states[i] = State[i]; + } + } + } + + barrier(CLK_GLOBAL_MEM_FENCE); # if (COMP_MODE == 1) - // do not use early return here - if (gIdx < Threads) + // do not use early return here + if (gIdx < Threads) # endif - { - text = vload4(get_local_id(1) + 4, (__global uint *)(states)); - - #pragma unroll - for (int i = 0; i < 4; ++i) { - ((ulong *)ExpandedKey1)[i] = states[i]; - } - - AESExpandKey256(ExpandedKey1); - } - - mem_fence(CLK_LOCAL_MEM_FENCE); - -// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast -#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) - __local uint4 xin[8][8]; - { - - /* Also left over threads 
perform this loop. - * The left over thread results will be ignored - */ - #pragma unroll 16 - for (size_t i = 0; i < 16; i++) { - #pragma unroll 10 - for (int j = 0; j < 10; ++j) { - uint4 t = ((uint4 *)ExpandedKey1)[j]; - t.s0 ^= AES0[BYTE(text.s0, 0)] ^ AES1[BYTE(text.s1, 1)] ^ AES2[BYTE(text.s2, 2)] ^ AES3[BYTE(text.s3, 3)]; - t.s1 ^= AES0[BYTE(text.s1, 0)] ^ AES1[BYTE(text.s2, 1)] ^ AES2[BYTE(text.s3, 2)] ^ AES3[BYTE(text.s0, 3)]; - t.s2 ^= AES0[BYTE(text.s2, 0)] ^ AES1[BYTE(text.s3, 1)] ^ AES2[BYTE(text.s0, 2)] ^ AES3[BYTE(text.s1, 3)]; - t.s3 ^= AES0[BYTE(text.s3, 0)] ^ AES1[BYTE(text.s0, 1)] ^ AES2[BYTE(text.s1, 2)] ^ AES3[BYTE(text.s2, 3)]; - text = t; - } - - barrier(CLK_LOCAL_MEM_FENCE); - xin[get_local_id(1)][get_local_id(0)] = text; - barrier(CLK_LOCAL_MEM_FENCE); - text = mix_and_propagate(xin); - } - } + { + text = vload4(get_local_id(1) + 4, (__global uint *)(states)); + + #pragma unroll + for (int i = 0; i < 4; ++i) { + ((ulong *)ExpandedKey1)[i] = states[i]; + } + + AESExpandKey256(ExpandedKey1); + } + + mem_fence(CLK_LOCAL_MEM_FENCE); + +#if (ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + __local uint4 xin[8][8]; + { + + /* Also left over threads perform this loop. 
+ * The left over thread results will be ignored + */ + #pragma unroll 16 + for (size_t i = 0; i < 16; i++) { + #pragma unroll 10 + for (int j = 0; j < 10; ++j) { + uint4 t = ((uint4 *)ExpandedKey1)[j]; + t.s0 ^= AES0[BYTE(text.s0, 0)] ^ AES1[BYTE(text.s1, 1)] ^ AES2[BYTE(text.s2, 2)] ^ AES3[BYTE(text.s3, 3)]; + t.s1 ^= AES0[BYTE(text.s1, 0)] ^ AES1[BYTE(text.s2, 1)] ^ AES2[BYTE(text.s3, 2)] ^ AES3[BYTE(text.s0, 3)]; + t.s2 ^= AES0[BYTE(text.s2, 0)] ^ AES1[BYTE(text.s3, 1)] ^ AES2[BYTE(text.s0, 2)] ^ AES3[BYTE(text.s1, 3)]; + t.s3 ^= AES0[BYTE(text.s3, 0)] ^ AES1[BYTE(text.s0, 1)] ^ AES2[BYTE(text.s1, 2)] ^ AES3[BYTE(text.s2, 3)]; + text = t; + } + + barrier(CLK_LOCAL_MEM_FENCE); + xin[get_local_id(1)][get_local_id(0)] = text; + barrier(CLK_LOCAL_MEM_FENCE); + text = mix_and_propagate(xin); + } + } #endif #if(COMP_MODE==1) - // do not use early return here + // do not use early return here if(gIdx < Threads) #endif - { - - #pragma unroll 2 - for(int i = 0; i < (MEMORY >> 4); i += 8) { - #pragma unroll 10 - for (int j = 0; j < 10; ++j) { - uint4 t = ((uint4 *)ExpandedKey1)[j]; - t.s0 ^= AES0[BYTE(text.s0, 0)] ^ AES1[BYTE(text.s1, 1)] ^ AES2[BYTE(text.s2, 2)] ^ AES3[BYTE(text.s3, 3)]; - t.s1 ^= AES0[BYTE(text.s1, 0)] ^ AES1[BYTE(text.s2, 1)] ^ AES2[BYTE(text.s3, 2)] ^ AES3[BYTE(text.s0, 3)]; - t.s2 ^= AES0[BYTE(text.s2, 0)] ^ AES1[BYTE(text.s3, 1)] ^ AES2[BYTE(text.s0, 2)] ^ AES3[BYTE(text.s1, 3)]; - t.s3 ^= AES0[BYTE(text.s3, 0)] ^ AES1[BYTE(text.s0, 1)] ^ AES2[BYTE(text.s1, 2)] ^ AES3[BYTE(text.s2, 3)]; - text = t; - } - - Scratchpad[IDX(i + get_local_id(1))] = text; - } - } - mem_fence(CLK_GLOBAL_MEM_FENCE); + { + + #pragma unroll 2 + for(int i = 0; i < (MEMORY >> 4); i += 8) { + #pragma unroll 10 + for (int j = 0; j < 10; ++j) { + uint4 t = ((uint4 *)ExpandedKey1)[j]; + t.s0 ^= AES0[BYTE(text.s0, 0)] ^ AES1[BYTE(text.s1, 1)] ^ AES2[BYTE(text.s2, 2)] ^ AES3[BYTE(text.s3, 3)]; + t.s1 ^= AES0[BYTE(text.s1, 0)] ^ AES1[BYTE(text.s2, 1)] ^ AES2[BYTE(text.s3, 2)] ^ 
AES3[BYTE(text.s0, 3)]; + t.s2 ^= AES0[BYTE(text.s2, 0)] ^ AES1[BYTE(text.s3, 1)] ^ AES2[BYTE(text.s0, 2)] ^ AES3[BYTE(text.s1, 3)]; + t.s3 ^= AES0[BYTE(text.s3, 0)] ^ AES1[BYTE(text.s0, 1)] ^ AES2[BYTE(text.s1, 2)] ^ AES3[BYTE(text.s2, 3)]; + text = t; + } + + Scratchpad[IDX(i + get_local_id(1))] = text; + } + } + mem_fence(CLK_GLOBAL_MEM_FENCE); } )===" R"===( -// cryptonight_monero_v8 && NVIDIA -#if(ALGO==11 && defined(__NV_CL_C_VERSION)) +// __NV_CL_C_VERSION checks if NVIDIA opencl is used +#if(ALGO == cryptonight_monero_v8 && defined(__NV_CL_C_VERSION)) # define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4)))) # define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4))))) #else @@ -547,16 +648,18 @@ R"===( __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states, uint Threads -// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 -#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) + +#if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) , __global ulong *input #endif ) { ulong a[2]; +#if(ALGO == cryptonight_conceal) + float4 conc_var = (float4)(0.0f); +#endif -// cryptonight_monero_v8 -#if(ALGO==11) +#if(ALGO == cryptonight_monero_v8) ulong b[4]; uint4 b_x[2]; // NVIDIA @@ -568,123 +671,134 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states ulong b[2]; uint4 b_x[1]; #endif - __local uint AES0[256], AES1[256]; + __local uint AES0[256], AES1[256]; -// cryptonight_monero_v8 -#if(ALGO==11) +#if(ALGO == cryptonight_monero_v8) # if defined(__clang__) && !defined(__NV_CL_C_VERSION) - __local uint RCP[256]; + __local uint RCP[256]; # endif 
uint2 division_result; uint sqrt_result; #endif - const uint gIdx = getIdx(); + const uint gIdx = getIdx(); for(int i = get_local_id(0); i < 256; i += WORKSIZE) { - const uint tmp = AES0_C[i]; - AES0[i] = tmp; - AES1[i] = rotate(tmp, 8U); -// cryptonight_monero_v8 -#if(ALGO==11 && (defined(__clang__) && !defined(__NV_CL_C_VERSION))) + const uint tmp = AES0_C[i]; + AES0[i] = tmp; + AES1[i] = rotate(tmp, 8U); + +#if(ALGO == cryptonight_monero_v8 && (defined(__clang__) && !defined(__NV_CL_C_VERSION))) RCP[i] = RCP_C[i]; #endif - } + } + + barrier(CLK_LOCAL_MEM_FENCE); - barrier(CLK_LOCAL_MEM_FENCE); -// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 -#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) - uint2 tweak1_2; +#if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) + uint2 tweak1_2; #endif #if(COMP_MODE==1) - // do not use early return here + // do not use early return here if(gIdx < Threads) #endif - { - states += 25 * gIdx; + { + states += 25 * gIdx; #if(STRIDED_INDEX==0) - Scratchpad += gIdx * (MEMORY >> 4); + Scratchpad += gIdx * (MEMORY >> 4); #elif(STRIDED_INDEX==1) Scratchpad += gIdx; #elif(STRIDED_INDEX==2) - Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0); + Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0); #elif(STRIDED_INDEX==3) Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + (gIdx % WORKSIZE); #endif - a[0] = states[0] ^ states[4]; - b[0] = states[2] ^ states[6]; - a[1] = states[1] ^ states[5]; - b[1] = states[3] ^ states[7]; + a[0] = states[0] ^ states[4]; + b[0] = states[2] ^ states[6]; + a[1] = states[1] ^ states[5]; + b[1] = states[3] ^ states[7]; b_x[0] = ((uint4 *)b)[0]; -// cryptonight_monero_v8 -#if(ALGO==11) - a[1] = 
states[1] ^ states[5]; - b[2] = states[8] ^ states[10]; - b[3] = states[9] ^ states[11]; +#if(ALGO == cryptonight_monero_v8) + a[1] = states[1] ^ states[5]; + b[2] = states[8] ^ states[10]; + b[3] = states[9] ^ states[11]; b_x[1] = ((uint4 *)b)[1]; division_result = as_uint2(states[12]); sqrt_result = as_uint2(states[13]).s0; #endif -// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 -#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) + +#if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) tweak1_2 = as_uint2(input[4]); tweak1_2.s0 >>= 24; tweak1_2.s0 |= tweak1_2.s1 << 8; tweak1_2.s1 = (uint)get_global_id(0); tweak1_2 ^= as_uint2(states[24]); #endif - } + } - mem_fence(CLK_LOCAL_MEM_FENCE); + mem_fence(CLK_LOCAL_MEM_FENCE); #if(COMP_MODE==1) - // do not use early return here + // do not use early return here if(gIdx < Threads) #endif - { + { uint idx0 = as_uint2(a[0]).s0 & MASK; #pragma unroll CN_UNROLL - for(int i = 0; i < ITERATIONS; ++i) - { + for(int i = 0; i < ITERATIONS; ++i) + { ulong c[2]; -// cryptonight_monero_v8 && NVIDIA -#if(ALGO==11 && defined(__NV_CL_C_VERSION)) + +#if(ALGO == cryptonight_monero_v8 && defined(__NV_CL_C_VERSION)) uint idxS = idx0 & 0x30U; *scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL; #endif ((uint4 *)c)[0] = SCRATCHPAD_CHUNK(0); -// cryptonight_bittube2 -#if(ALGO == 10) + +#if(ALGO == cryptonight_conceal) + float4 r = convert_float4_rte(((int4 *)c)[0]); + float4 c_old = conc_var; + r = _mm_add_ps(r, conc_var); + r = _mm_mul_ps(r, _mm_mul_ps(r, r)); + r = _mm_and_ps(r, 0x807FFFFF); + r = _mm_or_ps(r, 0x40000000); + conc_var = _mm_add_ps(conc_var, r); + + c_old = _mm_and_ps(c_old, 0x807FFFFF); + c_old = _mm_or_ps(c_old, 0x40000000); + float4 nc = _mm_mul_ps(c_old, (float4)(536870880.0f)); + ((int4 
*)c)[0] ^= convert_int4_rte(nc); +#endif + +#if(ALGO == cryptonight_bittube2) ((uint4 *)c)[0] = AES_Round2_bittube2(AES0, AES1, ~((uint4 *)c)[0], ((uint4 *)a)[0]); #else ((uint4 *)c)[0] = AES_Round2(AES0, AES1, ((uint4 *)c)[0], ((uint4 *)a)[0]); #endif -// cryptonight_monero_v8 -#if(ALGO==11) - { +#if(ALGO == cryptonight_monero_v8) + { ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)); ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3)); SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]); SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]); - SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]); - } + SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]); + } #endif -// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 -#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) +#if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) uint table = 0x75310U; b_x[0] ^= ((uint4 *)c)[0]; -// cryptonight_stellite -# if(ALGO == 7) + +# if(ALGO == cryptonight_stellite) uint index = ((b_x[0].s2 >> 27) & 12) | ((b_x[0].s2 >> 23) & 2); # else uint index = ((b_x[0].s2 >> 26) & 12) | ((b_x[0].s2 >> 23) & 2); @@ -692,8 +806,8 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states b_x[0].s2 ^= ((table >> index) & 0x30U) << 24; SCRATCHPAD_CHUNK(0) = b_x[0]; idx0 = as_uint2(c[0]).s0 & MASK; -// cryptonight_monero_v8 -#elif(ALGO==11) + +#elif(ALGO == cryptonight_monero_v8) SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0]; # ifdef __NV_CL_C_VERSION // flush shuffled data @@ -711,11 +825,11 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states #endif uint4 tmp; tmp = SCRATCHPAD_CHUNK(0); -// cryptonight_monero_v8 -#if(ALGO==11) + 
+#if(ALGO == cryptonight_monero_v8) // Use division and square root results from the _previous_ iteration to hide the latency - tmp.s0 ^= division_result.s0; - tmp.s1 ^= division_result.s1 ^ sqrt_result; + tmp.s0 ^= division_result.s0; + tmp.s1 ^= division_result.s1 ^ sqrt_result; // Most and least significant bits in the divisor are set to 1 // to make sure we don't divide by a small or even number, // so there are no shortcuts for such cases @@ -748,11 +862,10 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states a[1] += c[0] * as_ulong2(tmp).s0; a[0] += mul_hi(c[0], as_ulong2(tmp).s0); #endif -// cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 -#if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) -// cryptonight_ipbc || cryptonight_bittube2 -# if(ALGO == 6 || ALGO == 10) +#if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) + +# if(ALGO == cryptonight_ipbc || ALGO == cryptonight_bittube2) uint2 ipbc_tmp = tweak1_2 ^ ((uint2 *)&(a[0]))[0]; ((uint2 *)&(a[1]))[0] ^= ipbc_tmp; SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0]; @@ -767,10 +880,9 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0]; #endif - ((uint4 *)a)[0] ^= tmp; + ((uint4 *)a)[0] ^= tmp; -// cryptonight_monero_v8 -#if (ALGO == 11) +#if (ALGO == cryptonight_monero_v8) # if defined(__NV_CL_C_VERSION) // flush shuffled data SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line; @@ -780,15 +892,13 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states b_x[0] = ((uint4 *)c)[0]; idx0 = as_uint2(a[0]).s0 & MASK; -// cryptonight_heavy || cryptonight_bittube2 -#if (ALGO == 4 || ALGO == 10) +#if (ALGO == cryptonight_heavy || ALGO == cryptonight_bittube2) long n = 
*((__global long*)(Scratchpad + (IDX((idx0) >> 4)))); int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2]; - long q = fast_div_heavy(n, d | 0x5); + long q = fast_div_heavy(n, d | 0x5); *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q; idx0 = (d ^ as_int2(q).s0) & MASK; -// cryptonight_haven || cryptonight_superfast -#elif (ALGO == 9 || ALGO == 12) +#elif (ALGO == cryptonight_haven || ALGO == cryptonight_superfast) long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))); int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2]; long q = fast_div_heavy(n, d | 0x5); @@ -796,461 +906,504 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states idx0 = ((~d) ^ as_int2(q).s0) & MASK; #endif - } - } - mem_fence(CLK_GLOBAL_MEM_FENCE); + } + } + mem_fence(CLK_GLOBAL_MEM_FENCE); } )===" R"===( __attribute__((reqd_work_group_size(8, 8, 1))) -__kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, uint Threads) +__kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states, + +#if (ALGO == cryptonight_gpu) + __global uint *output, ulong Target, uint Threads) +#else + __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, uint Threads) +#endif { - __local uint AES0[256], AES1[256], AES2[256], AES3[256]; - uint ExpandedKey2[40]; - uint4 text; - - const uint gIdx = getIdx(); - - for (int i = get_local_id(1) * 8 + get_local_id(0); i < 256; i += 8 * 8) { - const uint tmp = AES0_C[i]; - AES0[i] = tmp; - AES1[i] = rotate(tmp, 8U); - AES2[i] = rotate(tmp, 16U); - AES3[i] = rotate(tmp, 24U); - } - - barrier(CLK_LOCAL_MEM_FENCE); - -// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast -#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) - __local uint4 xin1[8][8]; - __local uint4 xin2[8][8]; + __local uint 
AES0[256], AES1[256], AES2[256], AES3[256]; + uint ExpandedKey2[40]; + uint4 text; + + const uint gIdx = getIdx(); + + for (int i = get_local_id(1) * 8 + get_local_id(0); i < 256; i += 8 * 8) { + const uint tmp = AES0_C[i]; + AES0[i] = tmp; + AES1[i] = rotate(tmp, 8U); + AES2[i] = rotate(tmp, 16U); + AES3[i] = rotate(tmp, 24U); + } + + barrier(CLK_LOCAL_MEM_FENCE); + +#if (ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + __local uint4 xin1[8][8]; + __local uint4 xin2[8][8]; #endif #if(COMP_MODE==1) - // do not use early return here - if(gIdx < Threads) + // do not use early return here + if(gIdx < Threads) #endif - { - states += 25 * gIdx; + { + states += 25 * gIdx; #if(STRIDED_INDEX==0) - Scratchpad += gIdx * (MEMORY >> 4); + Scratchpad += gIdx * (MEMORY >> 4); #elif(STRIDED_INDEX==1) Scratchpad += gIdx; #elif(STRIDED_INDEX==2) - Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * (gIdx % WORKSIZE); + Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * (gIdx % WORKSIZE); #elif(STRIDED_INDEX==3) Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + (gIdx % WORKSIZE); #endif - #if defined(__Tahiti__) || defined(__Pitcairn__) + #if defined(__Tahiti__) || defined(__Pitcairn__) - for(int i = 0; i < 4; ++i) ((ulong *)ExpandedKey2)[i] = states[i + 4]; - text = vload4(get_local_id(1) + 4, (__global uint *)states); + for(int i = 0; i < 4; ++i) ((ulong *)ExpandedKey2)[i] = states[i + 4]; + text = vload4(get_local_id(1) + 4, (__global uint *)states); - #else + #else - text = vload4(get_local_id(1) + 4, (__global uint *)states); - ((uint8 *)ExpandedKey2)[0] = vload8(1, (__global uint *)states); + text = vload4(get_local_id(1) + 4, (__global uint *)states); + ((uint8 *)ExpandedKey2)[0] = vload8(1, (__global uint *)states); - #endif + #endif - AESExpandKey256(ExpandedKey2); - } + AESExpandKey256(ExpandedKey2); + } - 
barrier(CLK_LOCAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); -// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast -#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) - __local uint4* xin1_store = &xin1[get_local_id(1)][get_local_id(0)]; - __local uint4* xin1_load = &xin1[(get_local_id(1) + 1) % 8][get_local_id(0)]; - __local uint4* xin2_store = &xin2[get_local_id(1)][get_local_id(0)]; - __local uint4* xin2_load = &xin2[(get_local_id(1) + 1) % 8][get_local_id(0)]; - *xin2_store = (uint4)(0, 0, 0, 0); +#if (ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + __local uint4* xin1_store = &xin1[get_local_id(1)][get_local_id(0)]; + __local uint4* xin1_load = &xin1[(get_local_id(1) + 1) % 8][get_local_id(0)]; + __local uint4* xin2_store = &xin2[get_local_id(1)][get_local_id(0)]; + __local uint4* xin2_load = &xin2[(get_local_id(1) + 1) % 8][get_local_id(0)]; + *xin2_store = (uint4)(0, 0, 0, 0); #endif #if(COMP_MODE == 1) - // do not use early return here - if (gIdx < Threads) + // do not use early return here + if (gIdx < Threads) #endif - { -#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) - #pragma unroll 2 - for(int i = 0, i1 = get_local_id(1); i < (MEMORY >> 7); ++i, i1 = (i1 + 16) % (MEMORY >> 4)) - { - text ^= Scratchpad[IDX((uint)i1)]; - barrier(CLK_LOCAL_MEM_FENCE); - text ^= *xin2_load; + { + +#if (ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + #pragma unroll 2 + for(int i = 0, i1 = get_local_id(1); i < (MEMORY >> 7); ++i, i1 = (i1 + 16) % (MEMORY >> 4)) + { + text ^= Scratchpad[IDX((uint)i1)]; + barrier(CLK_LOCAL_MEM_FENCE); + text ^= *xin2_load; - #pragma unroll 10 - for(int j = 0; j < 10; ++j) - text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); + #pragma unroll 10 + for(int j = 0; j < 
10; ++j) + text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); - *xin1_store = text; + *xin1_store = text; - text ^= Scratchpad[IDX((uint)i1 + 8u)]; - barrier(CLK_LOCAL_MEM_FENCE); - text ^= *xin1_load; + text ^= Scratchpad[IDX((uint)i1 + 8u)]; + barrier(CLK_LOCAL_MEM_FENCE); + text ^= *xin1_load; - #pragma unroll 10 - for(int j = 0; j < 10; ++j) - text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); + #pragma unroll 10 + for(int j = 0; j < 10; ++j) + text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); - *xin2_store = text; - } + *xin2_store = text; + } - barrier(CLK_LOCAL_MEM_FENCE); - text ^= *xin2_load; + barrier(CLK_LOCAL_MEM_FENCE); + text ^= *xin2_load; #else - #pragma unroll 2 - for (int i = 0; i < (MEMORY >> 7); ++i) { - text ^= Scratchpad[IDX((uint)((i << 3) + get_local_id(1)))]; - - #pragma unroll 10 - for(int j = 0; j < 10; ++j) - text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); - } + #pragma unroll 2 + for (int i = 0; i < (MEMORY >> 7); ++i) { + text ^= Scratchpad[IDX((uint)((i << 3) + get_local_id(1)))]; + + #pragma unroll 10 + for(int j = 0; j < 10; ++j) + text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); + } #endif - } - -// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast -#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) - /* Also left over threads performe this loop. 
- * The left over thread results will be ignored - */ - #pragma unroll 16 - for(size_t i = 0; i < 16; i++) - { - #pragma unroll 10 - for (int j = 0; j < 10; ++j) { - text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); - } - - barrier(CLK_LOCAL_MEM_FENCE); - *xin1_store = text; - barrier(CLK_LOCAL_MEM_FENCE); - text ^= *xin1_load; - } + } + +#if (ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + /* Also left over threads performe this loop. + * The left over thread results will be ignored + */ + #pragma unroll 16 + for(size_t i = 0; i < 16; i++) + { + #pragma unroll 10 + for (int j = 0; j < 10; ++j) { + text = AES_Round(AES0, AES1, AES2, AES3, text, ((uint4 *)ExpandedKey2)[j]); + } + + barrier(CLK_LOCAL_MEM_FENCE); + *xin1_store = text; + barrier(CLK_LOCAL_MEM_FENCE); + text ^= *xin1_load; + } #endif - __local ulong State_buf[8 * 25]; + __local ulong State_buf[8 * 25]; #if(COMP_MODE==1) - // do not use early return here - if(gIdx < Threads) + // do not use early return here + if(gIdx < Threads) #endif - { - vstore2(as_ulong2(text), get_local_id(1) + 4, states); - } + { + vstore2(as_ulong2(text), get_local_id(1) + 4, states); + } - barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_GLOBAL_MEM_FENCE); #if(COMP_MODE==1) - // do not use early return here - if(gIdx < Threads) + // do not use early return here + if(gIdx < Threads) #endif - { - if(!get_local_id(1)) - { - __local ulong* State = State_buf + get_local_id(0) * 25; + { + if(!get_local_id(1)) + { + __local ulong* State = State_buf + get_local_id(0) * 25; - for(int i = 0; i < 25; ++i) State[i] = states[i]; + for(int i = 0; i < 25; ++i) State[i] = states[i]; - keccakf1600_2(State); + keccakf1600_2(State); - for(int i = 0; i < 25; ++i) states[i] = State[i]; +#if (ALGO == cryptonight_gpu) + if(State[3] <= Target) + { + ulong outIdx = atomic_inc(output + 0xFF); + if(outIdx < 0xFF) + output[outIdx] = 
get_global_id(0); + } +#else + for(int i = 0; i < 25; ++i) states[i] = State[i]; - uint StateSwitch = State[0] & 3; - __global uint *destinationBranch1 = StateSwitch == 0 ? Branch0 : Branch1; - __global uint *destinationBranch2 = StateSwitch == 2 ? Branch2 : Branch3; - __global uint *destinationBranch = StateSwitch < 2 ? destinationBranch1 : destinationBranch2; - destinationBranch[atomic_inc(destinationBranch + Threads)] = gIdx; - } - } - mem_fence(CLK_GLOBAL_MEM_FENCE); + uint StateSwitch = State[0] & 3; + __global uint *destinationBranch1 = StateSwitch == 0 ? Branch0 : Branch1; + __global uint *destinationBranch2 = StateSwitch == 2 ? Branch2 : Branch3; + __global uint *destinationBranch = StateSwitch < 2 ? destinationBranch1 : destinationBranch2; + destinationBranch[atomic_inc(destinationBranch + Threads)] = gIdx; +#endif + } + } + mem_fence(CLK_GLOBAL_MEM_FENCE); } )===" R"===( #define VSWAP8(x) (((x) >> 56) | (((x) >> 40) & 0x000000000000FF00UL) | (((x) >> 24) & 0x0000000000FF0000UL) \ - | (((x) >> 8) & 0x00000000FF000000UL) | (((x) << 8) & 0x000000FF00000000UL) \ - | (((x) << 24) & 0x0000FF0000000000UL) | (((x) << 40) & 0x00FF000000000000UL) | (((x) << 56) & 0xFF00000000000000UL)) + | (((x) >> 8) & 0x00000000FF000000UL) | (((x) << 8) & 0x000000FF00000000UL) \ + | (((x) << 24) & 0x0000FF0000000000UL) | (((x) << 40) & 0x00FF000000000000UL) | (((x) << 56) & 0xFF00000000000000UL)) #define VSWAP4(x) ((((x) >> 24) & 0xFFU) | (((x) >> 8) & 0xFF00U) | (((x) << 8) & 0xFF0000U) | (((x) << 24) & 0xFF000000U)) __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, uint Threads) { - const ulong idx = get_global_id(0) - get_global_offset(0); - - // do not use early return here - if(idx < BranchBuf[Threads]) - { - states += 25 * BranchBuf[idx]; + const uint idx = get_global_id(0) - get_global_offset(0); - // skein - ulong8 h = vload8(0, SKEIN512_256_IV); + // do not use early return here + if(idx < BranchBuf[Threads]) + { + 
states += 25 * BranchBuf[idx]; - // Type field begins with final bit, first bit, then six bits of type; the last 96 - // bits are input processed (including in the block to be processed with that tweak) - // The output transform is only one run of UBI, since we need only 256 bits of output - // The tweak for the output transform is Type = Output with the Final bit set - // T[0] for the output is 8, and I don't know why - should be message size... - ulong t[3] = { 0x00UL, 0x7000000000000000UL, 0x00UL }; - ulong8 p, m; + // skein + ulong8 h = vload8(0, SKEIN512_256_IV); - for(uint i = 0; i < 4; ++i) - { - t[0] += i < 3 ? 0x40UL : 0x08UL; + // Type field begins with final bit, first bit, then six bits of type; the last 96 + // bits are input processed (including in the block to be processed with that tweak) + // The output transform is only one run of UBI, since we need only 256 bits of output + // The tweak for the output transform is Type = Output with the Final bit set + // T[0] for the output is 8, and I don't know why - should be message size... + ulong t[3] = { 0x00UL, 0x7000000000000000UL, 0x00UL }; + ulong8 p, m; - t[2] = t[0] ^ t[1]; + #pragma unroll 1 + for (uint i = 0; i < 4; ++i) + { + t[0] += i < 3 ? 0x40UL : 0x08UL; - m = (i < 3) ? vload8(i, states) : (ulong8)(states[24], 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL); - const ulong h8 = h.s0 ^ h.s1 ^ h.s2 ^ h.s3 ^ h.s4 ^ h.s5 ^ h.s6 ^ h.s7 ^ SKEIN_KS_PARITY; - p = Skein512Block(m, h, h8, t); + t[2] = t[0] ^ t[1]; - h = m ^ p; + m = (i < 3) ? vload8(i, states) : (ulong8)(states[24], 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL); + const ulong h8 = h.s0 ^ h.s1 ^ h.s2 ^ h.s3 ^ h.s4 ^ h.s5 ^ h.s6 ^ h.s7 ^ SKEIN_KS_PARITY; + p = Skein512Block(m, h, h8, t); - t[1] = i < 2 ? 0x3000000000000000UL : 0xB000000000000000UL; - } + h = m ^ p; - t[0] = 0x08UL; - t[1] = 0xFF00000000000000UL; - t[2] = t[0] ^ t[1]; + t[1] = i < 2 ? 
0x3000000000000000UL : 0xB000000000000000UL; + } - p = (ulong8)(0); - const ulong h8 = h.s0 ^ h.s1 ^ h.s2 ^ h.s3 ^ h.s4 ^ h.s5 ^ h.s6 ^ h.s7 ^ SKEIN_KS_PARITY; + t[0] = 0x08UL; + t[1] = 0xFF00000000000000UL; + t[2] = t[0] ^ t[1]; - p = Skein512Block(p, h, h8, t); + p = (ulong8)(0); + const ulong h8 = h.s0 ^ h.s1 ^ h.s2 ^ h.s3 ^ h.s4 ^ h.s5 ^ h.s6 ^ h.s7 ^ SKEIN_KS_PARITY; - //vstore8(p, 0, output); + p = Skein512Block(p, h, h8, t); - // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values - // and expect an accurate result for target > 32-bit without implementing carries - if(p.s3 <= Target) + // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values + // and expect an accurate result for target > 32-bit without implementing carries + if (p.s3 <= Target) { - ulong outIdx = atomic_inc(output + 0xFF); + ulong outIdx = atomic_inc(output + 0xFF); if(outIdx < 0xFF) - output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0); - } - } - mem_fence(CLK_GLOBAL_MEM_FENCE); + output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0); + } + } + mem_fence(CLK_GLOBAL_MEM_FENCE); } #define SWAP8(x) as_ulong(as_uchar8(x).s76543210) #define JHXOR \ - h0h ^= input[0]; \ - h0l ^= input[1]; \ - h1h ^= input[2]; \ - h1l ^= input[3]; \ - h2h ^= input[4]; \ - h2l ^= input[5]; \ - h3h ^= input[6]; \ - h3l ^= input[7]; \ + h0h ^= input[0]; \ + h0l ^= input[1]; \ + h1h ^= input[2]; \ + h1l ^= input[3]; \ + h2h ^= input[4]; \ + h2l ^= input[5]; \ + h3h ^= input[6]; \ + h3l ^= input[7]; \ \ - E8; \ + E8; \ \ - h4h ^= input[0]; \ - h4l ^= input[1]; \ - h5h ^= input[2]; \ - h5l ^= input[3]; \ - h6h ^= input[4]; \ - h6l ^= input[5]; \ - h7h ^= input[6]; \ - h7l ^= input[7] + h4h ^= input[0]; \ + h4l ^= input[1]; \ + h5h ^= input[2]; \ + h5l ^= input[3]; \ + h6h ^= input[4]; \ + h6l ^= input[5]; \ + h7h ^= input[6]; \ + h7l ^= input[7] __kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint 
*output, ulong Target, uint Threads) { - const uint idx = get_global_id(0) - get_global_offset(0); - - // do not use early return here - if(idx < BranchBuf[Threads]) - { - states += 25 * BranchBuf[idx]; - - sph_u64 h0h = 0xEBD3202C41A398EBUL, h0l = 0xC145B29C7BBECD92UL, h1h = 0xFAC7D4609151931CUL, h1l = 0x038A507ED6820026UL, h2h = 0x45B92677269E23A4UL, h2l = 0x77941AD4481AFBE0UL, h3h = 0x7A176B0226ABB5CDUL, h3l = 0xA82FFF0F4224F056UL; - sph_u64 h4h = 0x754D2E7F8996A371UL, h4l = 0x62E27DF70849141DUL, h5h = 0x948F2476F7957627UL, h5l = 0x6C29804757B6D587UL, h6h = 0x6C0D8EAC2D275E5CUL, h6l = 0x0F7A0557C6508451UL, h7h = 0xEA12247067D3E47BUL, h7l = 0x69D71CD313ABE389UL; - sph_u64 tmp; - - for(int i = 0; i < 3; ++i) - { - ulong input[8]; - - const int shifted = i << 3; - for(int x = 0; x < 8; ++x) input[x] = (states[shifted + x]); - JHXOR; - } - { - ulong input[8]; - input[0] = (states[24]); - input[1] = 0x80UL; - #pragma unroll 6 - for(int x = 2; x < 8; ++x) input[x] = 0x00UL; - JHXOR; - } - { - ulong input[8]; - for(int x = 0; x < 7; ++x) input[x] = 0x00UL; - input[7] = 0x4006000000000000UL; - JHXOR; - } - - //output[0] = h6h; - //output[1] = h6l; - //output[2] = h7h; - //output[3] = h7l; - - // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values - // and expect an accurate result for target > 32-bit without implementing carries + const uint idx = get_global_id(0) - get_global_offset(0); + + // do not use early return here + if(idx < BranchBuf[Threads]) + { + states += 25 * BranchBuf[idx]; + + sph_u64 h0h = 0xEBD3202C41A398EBUL, h0l = 0xC145B29C7BBECD92UL, h1h = 0xFAC7D4609151931CUL, h1l = 0x038A507ED6820026UL, h2h = 0x45B92677269E23A4UL, h2l = 0x77941AD4481AFBE0UL, h3h = 0x7A176B0226ABB5CDUL, h3l = 0xA82FFF0F4224F056UL; + sph_u64 h4h = 0x754D2E7F8996A371UL, h4l = 0x62E27DF70849141DUL, h5h = 0x948F2476F7957627UL, h5l = 0x6C29804757B6D587UL, h6h = 0x6C0D8EAC2D275E5CUL, h6l = 0x0F7A0557C6508451UL, h7h = 0xEA12247067D3E47BUL, h7l = 
0x69D71CD313ABE389UL; + sph_u64 tmp; + + for(uint i = 0; i < 3; ++i) + { + ulong input[8]; + + const int shifted = i << 3; + for (uint x = 0; x < 8; ++x) + { + input[x] = (states[shifted + x]); + } + + JHXOR; + } + + { + ulong input[8] = { (states[24]), 0x80UL, 0x00UL, 0x00UL, 0x00UL, 0x00UL, 0x00UL, 0x00UL }; + JHXOR; + } + + { + ulong input[8] = { 0x00UL, 0x00UL, 0x00UL, 0x00UL, 0x00UL, 0x00UL, 0x00UL, 0x4006000000000000UL }; + JHXOR; + } + + // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values + // and expect an accurate result for target > 32-bit without implementing carries if(h7l <= Target) { - ulong outIdx = atomic_inc(output + 0xFF); + ulong outIdx = atomic_inc(output + 0xFF); if(outIdx < 0xFF) output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0); - } - } - } + } + } +} #define SWAP4(x) as_uint(as_uchar4(x).s3210) __kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, uint Threads) { - const uint idx = get_global_id(0) - get_global_offset(0); - - // do not use early return here - if(idx < BranchBuf[Threads]) - { - states += 25 * BranchBuf[idx]; - - unsigned int m[16]; - unsigned int v[16]; - uint h[8]; - - ((uint8 *)h)[0] = vload8(0U, c_IV256); - - #pragma unroll 4 - for(uint i = 0, bitlen = 0; i < 4; ++i) - { - if(i < 3) - { - ((uint16 *)m)[0] = vload16(i, (__global uint *)states); - for(int i = 0; i < 16; ++i) m[i] = SWAP4(m[i]); - bitlen += 512; - } - else - { - m[0] = SWAP4(((__global uint *)states)[48]); - m[1] = SWAP4(((__global uint *)states)[49]); - m[2] = 0x80000000U; - - for(int i = 3; i < 13; ++i) m[i] = 0x00U; - - m[13] = 1U; - m[14] = 0U; - m[15] = 0x640; - bitlen += 64; - } - - ((uint16 *)v)[0].lo = ((uint8 *)h)[0]; - ((uint16 *)v)[0].hi = vload8(0U, c_u256); - - //v[12] ^= (i < 3) ? (i + 1) << 9 : 1600U; - //v[13] ^= (i < 3) ? 
(i + 1) << 9 : 1600U; - - v[12] ^= bitlen; - v[13] ^= bitlen; - - for(int r = 0; r < 14; r++) - { - GS(0, 4, 0x8, 0xC, 0x0); - GS(1, 5, 0x9, 0xD, 0x2); - GS(2, 6, 0xA, 0xE, 0x4); - GS(3, 7, 0xB, 0xF, 0x6); - GS(0, 5, 0xA, 0xF, 0x8); - GS(1, 6, 0xB, 0xC, 0xA); - GS(2, 7, 0x8, 0xD, 0xC); - GS(3, 4, 0x9, 0xE, 0xE); - } - - ((uint8 *)h)[0] ^= ((uint8 *)v)[0] ^ ((uint8 *)v)[1]; - } - - for(int i = 0; i < 8; ++i) h[i] = SWAP4(h[i]); - - // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values - // and expect an accurate result for target > 32-bit without implementing carries - uint2 t = (uint2)(h[6],h[7]); - if( as_ulong(t) <= Target) + const uint idx = get_global_id(0) - get_global_offset(0); + + // do not use early return here + if(idx < BranchBuf[Threads]) + { + states += 25 * BranchBuf[idx]; + + unsigned int m[16]; + unsigned int v[16]; + uint h[8]; + uint bitlen = 0; + + ((uint8 *)h)[0] = vload8(0U, c_IV256); + + for (uint i = 0; i < 3; ++i) + { + ((uint16 *)m)[0] = vload16(i, (__global uint *)states); + for (uint x = 0; x < 16; ++x) + { + m[x] = SWAP4(m[x]); + } + + bitlen += 512; + + ((uint16 *)v)[0].lo = ((uint8 *)h)[0]; + ((uint16 *)v)[0].hi = vload8(0U, c_u256); + + v[12] ^= bitlen; + v[13] ^= bitlen; + + for (uint r = 0; r < 14; r++) { + GS(0, 4, 0x8, 0xC, 0x0); + GS(1, 5, 0x9, 0xD, 0x2); + GS(2, 6, 0xA, 0xE, 0x4); + GS(3, 7, 0xB, 0xF, 0x6); + GS(0, 5, 0xA, 0xF, 0x8); + GS(1, 6, 0xB, 0xC, 0xA); + GS(2, 7, 0x8, 0xD, 0xC); + GS(3, 4, 0x9, 0xE, 0xE); + } + + ((uint8 *)h)[0] ^= ((uint8 *)v)[0] ^ ((uint8 *)v)[1]; + } + + m[0] = SWAP4(((__global uint *)states)[48]); + m[1] = SWAP4(((__global uint *)states)[49]); + m[2] = 0x80000000U; + m[3] = 0x00U; + m[4] = 0x00U; + m[5] = 0x00U; + m[6] = 0x00U; + m[7] = 0x00U; + m[8] = 0x00U; + m[9] = 0x00U; + m[10] = 0x00U; + m[11] = 0x00U; + m[12] = 0x00U; + m[13] = 1U; + m[14] = 0U; + m[15] = 0x640; + + bitlen += 64; + + ((uint16 *)v)[0].lo = ((uint8 *)h)[0]; + ((uint16 *)v)[0].hi = 
vload8(0U, c_u256); + + v[12] ^= bitlen; + v[13] ^= bitlen; + + for (uint r = 0; r < 14; r++) { + GS(0, 4, 0x8, 0xC, 0x0); + GS(1, 5, 0x9, 0xD, 0x2); + GS(2, 6, 0xA, 0xE, 0x4); + GS(3, 7, 0xB, 0xF, 0x6); + GS(0, 5, 0xA, 0xF, 0x8); + GS(1, 6, 0xB, 0xC, 0xA); + GS(2, 7, 0x8, 0xD, 0xC); + GS(3, 4, 0x9, 0xE, 0xE); + } + + ((uint8 *)h)[0] ^= ((uint8 *)v)[0] ^ ((uint8 *)v)[1]; + + for (uint i = 0; i < 8; ++i) { + h[i] = SWAP4(h[i]); + } + + // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values + // and expect an accurate result for target > 32-bit without implementing carries + uint2 t = (uint2)(h[6],h[7]); + if(as_ulong(t) <= Target) { - ulong outIdx = atomic_inc(output + 0xFF); + ulong outIdx = atomic_inc(output + 0xFF); if(outIdx < 0xFF) output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0); - } - } - } - -__kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, uint Threads) -{ - const uint idx = get_global_id(0) - get_global_offset(0); + } + } +} - // do not use early return here - if(idx < BranchBuf[Threads]) - { - states += 25 * BranchBuf[idx]; +#undef SWAP4 - ulong State[8]; - for(int i = 0; i < 7; ++i) State[i] = 0UL; +__kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global uint *output, ulong Target, uint Threads) +{ + const uint idx = get_global_id(0) - get_global_offset(0); - State[7] = 0x0001000000000000UL; + // do not use early return here + if(idx < BranchBuf[Threads]) + { + states += 25 * BranchBuf[idx]; - #pragma unroll 4 - for(uint i = 0; i < 4; ++i) - { - volatile ulong H[8], M[8]; + ulong State[8] = { 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0x0001000000000000UL }; +#if defined(__clang__) && !defined(__NV_CL_C_VERSION) + // on ROCM we need volatile for AMD RX5xx cards to avoid invalid shares + volatile +#endif + ulong H[8], M[8]; - if(i < 3) - { - ((ulong8 *)M)[0] = vload8(i, states); - } - else - { - M[0] = states[24]; - M[1] = 0x80UL; 
+ for (uint i = 0; i < 3; ++i) { + ((ulong8 *)M)[0] = vload8(i, states); - for(int x = 2; x < 7; ++x) M[x] = 0UL; + for (uint x = 0; x < 8; ++x) { + H[x] = M[x] ^ State[x]; + } - M[7] = 0x0400000000000000UL; - } + PERM_SMALL_P(H); + PERM_SMALL_Q(M); - for(int x = 0; x < 8; ++x) H[x] = M[x] ^ State[x]; + for (uint x = 0; x < 8; ++x) + { + State[x] ^= H[x] ^ M[x]; + } + } - PERM_SMALL_P(H); - PERM_SMALL_Q(M); + M[0] = states[24]; + M[1] = 0x80UL; + M[2] = 0UL; + M[3] = 0UL; + M[4] = 0UL; + M[5] = 0UL; + M[6] = 0UL; + M[7] = 0x0400000000000000UL; - for(int x = 0; x < 8; ++x) State[x] ^= H[x] ^ M[x]; - } + for (uint x = 0; x < 8; ++x) { + H[x] = M[x] ^ State[x]; + } - ulong tmp[8]; + PERM_SMALL_P(H); + PERM_SMALL_Q(M); - for(int i = 0; i < 8; ++i) tmp[i] = State[i]; + ulong tmp[8]; + for (uint i = 0; i < 8; ++i) { + tmp[i] = State[i] ^= H[i] ^ M[i]; + } - PERM_SMALL_P(State); + PERM_SMALL_P(State); - for(int i = 0; i < 8; ++i) State[i] ^= tmp[i]; + for (uint i = 0; i < 8; ++i) { + State[i] ^= tmp[i]; + } - // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values - // and expect an accurate result for target > 32-bit without implementing carries + // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values + // and expect an accurate result for target > 32-bit without implementing carries if(State[7] <= Target) { - ulong outIdx = atomic_inc(output + 0xFF); + ulong outIdx = atomic_inc(output + 0xFF); if(outIdx < 0xFF) output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0); - } - } - } + } + } +} )===" diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl new file mode 100644 index 000000000..e87819760 --- /dev/null +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_gpu.cl @@ -0,0 +1,329 @@ +R"===( + +inline global int4* scratchpad_ptr(uint idx, uint n, __global int *lpad) { return (__global int4*)((__global char*)lpad + (idx & 
MASK) + n * 16); } + +inline float4 fma_break(float4 x) +{ + // Break the dependency chain by setitng the exp to ?????01 + x = _mm_and_ps(x, 0xFEFFFFFF); + return _mm_or_ps(x, 0x00800000); +} + +inline void sub_round(float4 n0, float4 n1, float4 n2, float4 n3, float4 rnd_c, float4* n, float4* d, float4* c) +{ + n1 = _mm_add_ps(n1, *c); + float4 nn = _mm_mul_ps(n0, *c); + nn = _mm_mul_ps(n1, _mm_mul_ps(nn,nn)); + nn = fma_break(nn); + *n = _mm_add_ps(*n, nn); + + n3 = _mm_sub_ps(n3, *c); + float4 dd = _mm_mul_ps(n2, *c); + dd = _mm_mul_ps(n3, _mm_mul_ps(dd,dd)); + dd = fma_break(dd); + *d = _mm_add_ps(*d, dd); + + //Constant feedback + *c = _mm_add_ps(*c, rnd_c); + *c = _mm_add_ps(*c, (float4)(0.734375f)); + float4 r = _mm_add_ps(nn, dd); + r = _mm_and_ps(r, 0x807FFFFF); + r = _mm_or_ps(r, 0x40000000); + *c = _mm_add_ps(*c, r); + +} + +// 9*8 + 2 = 74 +inline void round_compute(float4 n0, float4 n1, float4 n2, float4 n3, float4 rnd_c, float4* c, float4* r) +{ + float4 n = (float4)(0.0f); + float4 d = (float4)(0.0f); + + sub_round(n0, n1, n2, n3, rnd_c, &n, &d, c); + sub_round(n1, n2, n3, n0, rnd_c, &n, &d, c); + sub_round(n2, n3, n0, n1, rnd_c, &n, &d, c); + sub_round(n3, n0, n1, n2, rnd_c, &n, &d, c); + sub_round(n3, n2, n1, n0, rnd_c, &n, &d, c); + sub_round(n2, n1, n0, n3, rnd_c, &n, &d, c); + sub_round(n1, n0, n3, n2, rnd_c, &n, &d, c); + sub_round(n0, n3, n2, n1, rnd_c, &n, &d, c); + + // Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0 + d = _mm_and_ps(d, 0xFF7FFFFF); + d = _mm_or_ps(d, 0x40000000); + *r =_mm_add_ps(*r, _mm_div_ps(n,d)); +} + +inline int4 single_comupte(float4 n0, float4 n1, float4 n2, float4 n3, float cnt, float4 rnd_c, __local float4* sum) +{ + float4 c= (float4)(cnt); + // 35 maths calls follow (140 FLOPS) + float4 r = (float4)(0.0f); + + for(int i = 0; i < 4; ++i) + round_compute(n0, n1, n2, n3, rnd_c, &c, &r); + + // do a quick fmod by setting exp to 2 + r = _mm_and_ps(r, 0x807FFFFF); + 
r = _mm_or_ps(r, 0x40000000); + *sum = r; // 34 + float4 x = (float4)(536870880.0f); + r = _mm_mul_ps(r, x); // 35 + return convert_int4_rte(r); +} + +inline void single_comupte_wrap(const uint rot, int4 v0, int4 v1, int4 v2, int4 v3, float cnt, float4 rnd_c, __local float4* sum, __local int4* out) +{ + float4 n0 = convert_float4_rte(v0); + float4 n1 = convert_float4_rte(v1); + float4 n2 = convert_float4_rte(v2); + float4 n3 = convert_float4_rte(v3); + + int4 r = single_comupte(n0, n1, n2, n3, cnt, rnd_c, sum); + *out = rot == 0 ? r : _mm_alignr_epi8(r, rot); +} + +)===" +R"===( + +static const __constant uint look[16][4] = { + {0, 1, 2, 3}, + {0, 2, 3, 1}, + {0, 3, 1, 2}, + {0, 3, 2, 1}, + + {1, 0, 2, 3}, + {1, 2, 3, 0}, + {1, 3, 0, 2}, + {1, 3, 2, 0}, + + {2, 1, 0, 3}, + {2, 0, 3, 1}, + {2, 3, 1, 0}, + {2, 3, 0, 1}, + + {3, 1, 2, 0}, + {3, 2, 0, 1}, + {3, 0, 1, 2}, + {3, 0, 2, 1} +}; + +static const __constant float ccnt[16] = { + 1.34375f, + 1.28125f, + 1.359375f, + 1.3671875f, + + 1.4296875f, + 1.3984375f, + 1.3828125f, + 1.3046875f, + + 1.4140625f, + 1.2734375f, + 1.2578125f, + 1.2890625f, + + 1.3203125f, + 1.3515625f, + 1.3359375f, + 1.4609375f +}; + +struct SharedMemChunk +{ + int4 out[16]; + float4 va[16]; +}; + +__attribute__((reqd_work_group_size(WORKSIZE * 16, 1, 1))) +__kernel void JOIN(cn1_cn_gpu,ALGO)(__global int *lpad_in, __global int *spad, uint numThreads) +{ + const uint gIdx = getIdx(); + +#if(COMP_MODE==1) + if(gIdx/16 >= numThreads) + return; +#endif + + uint chunk = get_local_id(0) / 16; + +#if(STRIDED_INDEX==0) + __global int* lpad = (__global int*)((__global char*)lpad_in + MEMORY * (gIdx/16)); +#endif + + __local struct SharedMemChunk smem_in[WORKSIZE]; + __local struct SharedMemChunk* smem = smem_in + chunk; + + uint tid = get_local_id(0) % 16; + + uint idxHash = gIdx/16; + uint s = ((__global uint*)spad)[idxHash * 50] >> 8; + float4 vs = (float4)(0); + + // tid divided + const uint tidd = tid / 4; + // tid modulo + const uint tidm = tid 
% 4; + const uint block = tidd * 16 + tidm; + + #pragma unroll CN_UNROLL + for(size_t i = 0; i < ITERATIONS; i++) + { + mem_fence(CLK_LOCAL_MEM_FENCE); + int tmp = ((__global int*)scratchpad_ptr(s, tidd, lpad))[tidm]; + ((__local int*)(smem->out))[tid] = tmp; + mem_fence(CLK_LOCAL_MEM_FENCE); + + { + single_comupte_wrap( + tidm, + *(smem->out + look[tid][0]), + *(smem->out + look[tid][1]), + *(smem->out + look[tid][2]), + *(smem->out + look[tid][3]), + ccnt[tid], vs, smem->va + tid, + smem->out + tid + ); + } + mem_fence(CLK_LOCAL_MEM_FENCE); + + int outXor = ((__local int*)smem->out)[block]; + for(uint dd = block + 4; dd < (tidd + 1) * 16; dd += 4) + outXor ^= ((__local int*)smem->out)[dd]; + + ((__global int*)scratchpad_ptr(s, tidd, lpad))[tidm] = outXor ^ tmp; + ((__local int*)smem->out)[tid] = outXor; + + float va_tmp1 = ((__local float*)smem->va)[block] + ((__local float*)smem->va)[block + 4]; + float va_tmp2 = ((__local float*)smem->va)[block+ 8] + ((__local float*)smem->va)[block + 12]; + ((__local float*)smem->va)[tid] = va_tmp1 + va_tmp2; + + mem_fence(CLK_LOCAL_MEM_FENCE); + + int out2 = ((__local int*)smem->out)[tid] ^ ((__local int*)smem->out)[tid + 4 ] ^ ((__local int*)smem->out)[tid + 8] ^ ((__local int*)smem->out)[tid + 12]; + va_tmp1 = ((__local float*)smem->va)[block] + ((__local float*)smem->va)[block + 4]; + va_tmp2 = ((__local float*)smem->va)[block + 8] + ((__local float*)smem->va)[block + 12]; + va_tmp1 = va_tmp1 + va_tmp2; + va_tmp1 = fabs(va_tmp1); + + float xx = va_tmp1 * 16777216.0f; + int xx_int = (int)xx; + ((__local int*)smem->out)[tid] = out2 ^ xx_int; + ((__local float*)smem->va)[tid] = va_tmp1 / 64.0f; + + mem_fence(CLK_LOCAL_MEM_FENCE); + + vs = smem->va[0]; + s = smem->out[0].x ^ smem->out[0].y ^ smem->out[0].z ^ smem->out[0].w; + } +} + +)===" +R"===( + +static const __constant uint skip[3] = { + 20,22,22 +}; + +inline void generate_512(uint idx, __local ulong* in, __global ulong* out) +{ + ulong hash[25]; + + hash[0] = in[0] ^ 
idx; + for(int i = 1; i < 25; ++i) + hash[i] = in[i]; + + for(int a = 0; a < 3;++a) + { + keccakf1600_1(hash); + for(int i = 0; i < skip[a]; ++i) + out[i] = hash[i]; + out+=skip[a]; + } +} + +__attribute__((reqd_work_group_size(8, 8, 1))) +__kernel void JOIN(cn0_cn_gpu,ALGO)(__global ulong *input, __global int *Scratchpad, __global ulong *states, uint Threads) +{ + const uint gIdx = getIdx(); + __local ulong State_buf[8 * 25]; + __local ulong* State = State_buf + get_local_id(0) * 25; + +#if(COMP_MODE==1) + // do not use early return here + if(gIdx < Threads) +#endif + { + states += 25 * gIdx; + +#if(STRIDED_INDEX==0) + Scratchpad = (__global int*)((__global char*)Scratchpad + MEMORY * gIdx); +#endif + + if (get_local_id(1) == 0) + { + +// NVIDIA +#ifdef __NV_CL_C_VERSION + for(uint i = 0; i < 8; ++i) + State[i] = input[i]; +#else + ((__local ulong8 *)State)[0] = vload8(0, input); +#endif + State[8] = input[8]; + State[9] = input[9]; + State[10] = input[10]; + + ((__local uint *)State)[9] &= 0x00FFFFFFU; + ((__local uint *)State)[9] |= (((uint)get_global_id(0)) & 0xFF) << 24; + ((__local uint *)State)[10] &= 0xFF000000U; + /* explicit cast to `uint` is required because some OpenCL implementations (e.g. NVIDIA) + * handle get_global_id and get_global_offset as signed long long int and add + * 0xFFFFFFFF... 
to `get_global_id` if we set on host side a 32bit offset where the first bit is `1` + * (even if it is correct casted to unsigned on the host) + */ + ((__local uint *)State)[10] |= (((uint)get_global_id(0) >> 8)); + + for (int i = 11; i < 25; ++i) { + State[i] = 0x00UL; + } + + // Last bit of padding + State[16] = 0x8000000000000000UL; + + keccakf1600_2(State); + + #pragma unroll + for (int i = 0; i < 25; ++i) { + states[i] = State[i]; + } + } + } +} + +__attribute__((reqd_work_group_size(64, 1, 1))) +__kernel void JOIN(cn00_cn_gpu,ALGO)(__global int *Scratchpad, __global ulong *states) +{ + const uint gIdx = getIdx() / 64; + __local ulong State[25]; + + states += 25 * gIdx; + +#if(STRIDED_INDEX==0) + Scratchpad = (__global int*)((__global char*)Scratchpad + MEMORY * gIdx); +#endif + + for(int i = get_local_id(0); i < 25; i+=get_local_size(0)) + State[i] = states[i]; + + barrier(CLK_LOCAL_MEM_FENCE); + + + for(uint i = get_local_id(0); i < MEMORY / 512; i += get_local_size(0)) + { + generate_512(i, State, (__global ulong*)((__global uchar*)Scratchpad + i*512)); + } +} + +)===" diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.cl new file mode 100644 index 000000000..9edb774ad --- /dev/null +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight_r.cl @@ -0,0 +1,220 @@ +R"===( +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ * + */ + +#define cryptonight_r_wow 15 +#define cryptonight_r 16 + +#define MEM_CHUNK (1 << MEM_CHUNK_EXPONENT) + +#if(STRIDED_INDEX==0) +# define IDX(x) (x) +#elif(STRIDED_INDEX==1) +# define IDX(x) (mul24(((uint)(x)), Threads)) +#elif(STRIDED_INDEX==2) +# define IDX(x) (((x) % MEM_CHUNK) + ((x) / MEM_CHUNK) * WORKSIZE * MEM_CHUNK) +#elif(STRIDED_INDEX==3) +# define IDX(x) ((x) * WORKSIZE) +#endif + +// __NV_CL_C_VERSION checks if NVIDIA opencl is used +#if(ALGO == cryptonight_monero_v8 && defined(__NV_CL_C_VERSION)) +# define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idx1 ^ (N << 4)))) +# define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4))))) +#else +# define SCRATCHPAD_CHUNK(N) (Scratchpad[IDX(((idx) >> 4) ^ N)]) +#endif + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void cn1_cryptonight_r(__global uint4 *Scratchpad, __global ulong *states, uint Threads) +{ + ulong a[2], b[4]; + __local uint AES0[256], AES1[256], AES2[256], AES3[256]; + +#ifdef __NV_CL_C_VERSION + __local uint16 scratchpad_line_buf[WORKSIZE]; + __local uint16* scratchpad_line = scratchpad_line_buf + get_local_id(0); +#endif + + const ulong gIdx = get_global_id(0) - get_global_offset(0); + + for(int i = get_local_id(0); i < 256; i += WORKSIZE) + { + const uint tmp = AES0_C[i]; + AES0[i] = tmp; + AES1[i] = rotate(tmp, 8U); + AES2[i] = rotate(tmp, 16U); + AES3[i] = rotate(tmp, 24U); + } + + barrier(CLK_LOCAL_MEM_FENCE); + +# if (COMP_MODE == 1) + // do not use early return here + if (gIdx < Threads) +# endif + { + states += 25 * gIdx; + +#if(STRIDED_INDEX==0) + Scratchpad += gIdx * (MEMORY >> 4); +#elif(STRIDED_INDEX==1) + Scratchpad += gIdx; +#elif(STRIDED_INDEX==2) + Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0); +#elif(STRIDED_INDEX==3) + Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + (gIdx % WORKSIZE); +#endif + + a[0] = states[0] ^ 
states[4]; + a[1] = states[1] ^ states[5]; + + b[0] = states[2] ^ states[6]; + b[1] = states[3] ^ states[7]; + b[2] = states[8] ^ states[10]; + b[3] = states[9] ^ states[11]; + } + + ulong2 bx0 = ((ulong2 *)b)[0]; + ulong2 bx1 = ((ulong2 *)b)[1]; + + mem_fence(CLK_LOCAL_MEM_FENCE); + +# if (COMP_MODE == 1) + // do not use early return here + if (gIdx < Threads) +# endif + { + + uint r0 = as_uint2(states[12]).s0; + uint r1 = as_uint2(states[12]).s1; + uint r2 = as_uint2(states[13]).s0; + uint r3 = as_uint2(states[13]).s1; + + #pragma unroll CN_UNROLL + for(int i = 0; i < ITERATIONS; ++i) + { +# ifdef __NV_CL_C_VERSION + uint idx = a[0] & 0x1FFFC0; + uint idx1 = a[0] & 0x30; + + *scratchpad_line = *(__global uint16*)((__global uchar*)(Scratchpad) + idx); +# else + uint idx = a[0] & MASK; +# endif + +#if(ALGO == cryptonight_monero_v8 && defined(__NV_CL_C_VERSION)) + *scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL; +#endif + uint4 c = SCRATCHPAD_CHUNK(0); + c = AES_Round(AES0, AES1, AES2, AES3, c, ((uint4 *)a)[0]); + + { + const ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)); + const ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); + const ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3)); + +#if (ALGO == cryptonight_r) + c ^= as_uint4(chunk1) ^ as_uint4(chunk2) ^ as_uint4(chunk3); +#endif + + SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + bx1); + SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + bx0); + SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]); + } + + SCRATCHPAD_CHUNK(0) = as_uint4(bx0) ^ c; + +# ifdef __NV_CL_C_VERSION + *(__global uint16*)((__global uchar*)(Scratchpad) + idx) = *scratchpad_line; + + idx = as_ulong2(c).s0 & 0x1FFFC0; + idx1 = as_ulong2(c).s0 & 0x30; + + *scratchpad_line = *(__global uint16*)((__global uchar*)(Scratchpad) + idx); +# else + idx = as_ulong2(c).s0 & MASK; +# endif + + uint4 tmp = SCRATCHPAD_CHUNK(0); + + tmp.s0 ^= r0 + r1; + tmp.s1 ^= r2 + r3; + const uint r4 = as_uint2(a[0]).s0; + const uint r5 = as_uint2(a[1]).s0; + const uint r6 = 
as_uint4(bx0).s0; + const uint r7 = as_uint4(bx1).s0; +#if (ALGO == cryptonight_r) + const uint r8 = as_uint4(bx1).s2; +#endif +#define ROT_BITS 32 + + XMRSTAK_INCLUDE_RANDOM_MATH + +#if (ALGO == cryptonight_r) + + const uint2 al = (uint2)(as_uint2(a[0]).s0 ^ r2, as_uint2(a[0]).s1 ^ r3); + const uint2 ah = (uint2)(as_uint2(a[1]).s0 ^ r0, as_uint2(a[1]).s1 ^ r1); +#endif + + ulong2 t; + t.s0 = mul_hi(as_ulong2(c).s0, as_ulong2(tmp).s0); + t.s1 = as_ulong2(c).s0 * as_ulong2(tmp).s0; + { + const ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)) +#if (ALGO == cryptonight_r_wow) + ^ t +#endif + ; + const ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); +#if (ALGO == cryptonight_r_wow) + t ^= chunk2; +#endif + const ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3)); + +#if (ALGO == cryptonight_r) + c ^= as_uint4(chunk1) ^ as_uint4(chunk2) ^ as_uint4(chunk3); +#endif + + SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + bx1); + SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + bx0); + SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]); + } + +#if (ALGO == cryptonight_r) + a[1] = as_ulong(ah) + t.s1; + a[0] = as_ulong(al) + t.s0; +#else + a[1] += t.s1; + a[0] += t.s0; +#endif + + SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0]; + +# ifdef __NV_CL_C_VERSION + *(__global uint16*)((__global uchar*)(Scratchpad) + idx) = *scratchpad_line; +# endif + + ((uint4 *)a)[0] ^= tmp; + bx1 = bx0; + bx0 = as_ulong2(c); + } + +# undef SCRATCHPAD_CHUNK + } + mem_fence(CLK_GLOBAL_MEM_FENCE); +} +)===" diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl index 161f2f55d..4469b0670 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl @@ -1,7 +1,6 @@ R"===( -#ifndef FAST_DIV_HEAVY_CL -#define FAST_DIV_HEAVY_CL +#if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) inline long fast_div_heavy(long _a, int _b) { long a 
= abs(_a); @@ -19,6 +18,5 @@ inline long fast_div_heavy(long _a, int _b) const long q = q1 + q2 + q3; return ((as_int2(_a).s1 ^ _b) < 0) ? -q : q; } - #endif )===" diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl index c170387b4..8878db618 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl @@ -3,8 +3,7 @@ R"===( * @author SChernykh */ -// cryptonight_monero_v8 -#if(ALGO==11) +#if(ALGO == cryptonight_monero_v8) static const __constant uint RCP_C[256] = { diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp index ba4cebb7b..ea688e053 100644 --- a/xmrstak/backend/amd/autoAdjust.hpp +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -83,10 +83,13 @@ class autoAdjust constexpr size_t byteToMiB = 1024u * 1024u; - size_t hashMemSize = std::max( - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()), - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) - ); + auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); + + size_t hashMemSize = 0; + for(const auto algo : neededAlgorithms) + { + hashMemSize = std::max(hashMemSize, algo.Mem()); + } std::string conf; for(auto& ctx : devVec) @@ -128,18 +131,17 @@ class autoAdjust } // check if cryptonight_monero_v8 is selected for the user or dev pool - bool useCryptonight_v8 = - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 || - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8 || - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_monero_v8 || - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_monero_v8; + bool useCryptonight_v8 = 
(std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end()); // true for all cryptonight_heavy derivates since we check the user and dev pool - bool useCryptonight_heavy = - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_heavy || - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_heavy || - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_heavy || - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_heavy; + bool useCryptonight_heavy = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end(); + + // true for cryptonight_gpu as main user pool algorithm + bool useCryptonight_gpu = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_gpu; + + bool useCryptonight_r = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_r; + + bool useCryptonight_r_wow = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_r_wow; // set strided index to default ctx.stridedIndex = 1; @@ -149,14 +151,28 @@ class autoAdjust ctx.stridedIndex = 0; // use chunked (4x16byte) scratchpad for all backends. 
Default `mem_chunk` is `2` - if(useCryptonight_v8) + if(useCryptonight_v8 || useCryptonight_r || useCryptonight_r_wow) ctx.stridedIndex = 2; else if(useCryptonight_heavy) ctx.stridedIndex = 3; - // increase all intensity limits by two for aeon - if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_lite) - maxThreads *= 2u; + if(hashMemSize < CN_MEMORY) + { + size_t factor = CN_MEMORY / hashMemSize; + // increase all intensity relative to the original scratchpad size + maxThreads *= factor; + } + + uint32_t numUnroll = 8; + + if(useCryptonight_gpu) + { + // 6 waves per compute unit are a good value (based on profiling) + // @todo check again after all optimizations + maxThreads = ctx.computeUnits * 6 * 8; + ctx.stridedIndex = 0; + numUnroll = 1; + } // keep 128MiB memory free (value is randomly chosen) from the max available memory const size_t maxAvailableFreeMem = ctx.freeMem - minFreeMem; @@ -164,7 +180,7 @@ class autoAdjust size_t memPerThread = std::min(ctx.maxMemPerAlloc, maxAvailableFreeMem); uint32_t numThreads = 1u; - if(ctx.isAMD) + if(ctx.isAMD && !useCryptonight_gpu) { numThreads = 2; size_t memDoubleThread = maxAvailableFreeMem / numThreads; @@ -199,7 +215,7 @@ class autoAdjust conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" + " \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" + " \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n" - " \"unroll\" : 8, \"comp_mode\" : true, \"interleave\" : " + std::to_string(ctx.interleave) + "\n" + + " \"unroll\" : " + std::to_string(numUnroll) + ", \"comp_mode\" : true, \"interleave\" : " + std::to_string(ctx.interleave) + "\n" + " },\n"; } } diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp index b0f4e6ecd..eb0009413 100644 --- a/xmrstak/backend/amd/minethd.cpp +++ b/xmrstak/backend/amd/minethd.cpp @@ -183,7 
+183,11 @@ void minethd::work_main() } // start with root algorithm and switch later if fork version is reached auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot(); - cn_hash_fun hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo); + + cpu::minethd::cn_on_new_job set_job; + + cn_hash_fun hash_fun; + cpu::minethd::func_multi_selector<1>(hash_fun, set_job, ::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo); uint8_t version = 0; size_t lastPoolId = 0; @@ -224,23 +228,26 @@ void minethd::work_main() if(new_version >= coinDesc.GetMiningForkVersion()) { miner_algo = coinDesc.GetMiningAlgo(); - hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo); + cpu::minethd::func_multi_selector<1>(hash_fun, set_job, ::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo); } else { miner_algo = coinDesc.GetMiningAlgoRoot(); - hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo); + cpu::minethd::func_multi_selector<1>(hash_fun, set_job, ::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo); } lastPoolId = oWork.iPoolId; version = new_version; } + if(set_job != nullptr) + set_job(oWork, &cpu_ctx); + size_t round_ctr = 0; assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID)); uint64_t target = oWork.iTarget; - XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target, miner_algo); + XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target, miner_algo, cpu_ctx->cn_r_ctx.height); if(oWork.bNiceHash) pGpuCtx->Nonce = *(uint32_t*)(oWork.bWorkBlob + 39); @@ -275,7 +282,7 @@ void minethd::work_main() *(uint32_t*)(bWorkBlob + 39) = results[i]; - hash_fun(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx); + hash_fun(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx, miner_algo); if ( (*((uint64_t*)(bResult + 24))) < 
oWork.iTarget) executor::inst()->push_event(ex_event(job_result(oWork.sJobID, results[i], bResult, iThreadNo, miner_algo), oWork.iPoolId)); else @@ -327,7 +334,7 @@ void minethd::work_main() ); } // update gpu with new intensity - XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target, miner_algo); + XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target, miner_algo, cpu_ctx->cn_r_ctx.height); } // use 3 rounds to warm up with the new intensity else if(cntTestRounds == autoTune + 3) diff --git a/xmrstak/backend/amd/minethd.hpp b/xmrstak/backend/amd/minethd.hpp index 74ab5fb60..402d63cd6 100644 --- a/xmrstak/backend/amd/minethd.hpp +++ b/xmrstak/backend/amd/minethd.hpp @@ -24,7 +24,7 @@ class minethd : public iBackend static bool init_gpus(); private: - typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**); + typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**, const xmrstak_algo&); minethd(miner_work& pWork, size_t iNo, GpuContext* ctx, const jconf::thd_cfg cfg); diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp index e7f3e9148..ba0e6984f 100644 --- a/xmrstak/backend/cpu/autoAdjust.hpp +++ b/xmrstak/backend/cpu/autoAdjust.hpp @@ -28,11 +28,15 @@ class autoAdjust bool printConfig() { + auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); + + size_t hashMemSize = 0; + for(const auto algo : neededAlgorithms) + { + hashMemSize = std::max(hashMemSize, algo.Mem()); + } + const size_t hashMemSizeKB = hashMemSize / 1024u; - const size_t hashMemSizeKB = std::max( - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()), - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) - ) / 1024u; const size_t halfHashMemSizeKB = hashMemSizeKB / 2u; configEditor configTpl{}; @@ -45,7 +49,14 @@ class autoAdjust std::string conf; + // if cryptonight_gpu is used we will disable cpu 
mining but provide a inactive config + bool useCryptonight_gpu = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_gpu; + if(useCryptonight_gpu) + { + printer::inst()->print_msg(L0, "WARNING: CPU mining will be disabled because cryptonight_gpu is not suitable for CPU mining. You can uncomment the auto generated config in %s to enable CPU mining.", params::inst().configFileCPU.c_str()); + conf += "/*\n//CPU config is disabled by default because cryptonight_gpu is not suitable for CPU mining.\n"; + } if(!detectL3Size() || L3KB_size < halfHashMemSizeKB || L3KB_size > (halfHashMemSizeKB * 2048u)) { if(L3KB_size < halfHashMemSizeKB || L3KB_size > (halfHashMemSizeKB * 2048)) @@ -96,6 +107,9 @@ class autoAdjust } } + if(useCryptonight_gpu) + conf += "*/\n"; + configTpl.replace("CPUCONFIG",conf); configTpl.write(params::inst().configFileCPU); printer::inst()->print_msg(L0, "CPU configuration stored in file '%s'", params::inst().configFileCPU.c_str()); diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp index b61582588..f09b1ebc0 100644 --- a/xmrstak/backend/cpu/autoAdjustHwloc.hpp +++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp @@ -28,10 +28,12 @@ class autoAdjust autoAdjust() { - hashMemSize = std::max( - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()), - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) - ); + auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); + + for(const auto algo : neededAlgorithms) + { + hashMemSize = std::max(hashMemSize, algo.Mem()); + } halfHashMemSize = hashMemSize / 2u; } @@ -51,6 +53,15 @@ class autoAdjust ; configTpl.set( std::string(tpl) ); + // if cryptonight_gpu is used we will disable cpu mining but provide a inactive config + bool useCryptonight_gpu = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() 
== cryptonight_gpu; + + if(useCryptonight_gpu) + { + printer::inst()->print_msg(L0, "WARNING: CPU mining will be disabled because cryptonight_gpu is not suitable for CPU mining. You can uncomment the auto generated config in %s to enable CPU mining.", params::inst().configFileCPU.c_str()); + conf += "/*\n//CPU config is disabled by default because cryptonight_gpu is not suitable for CPU mining.\n"; + } + try { std::vector tlcs; @@ -83,6 +94,9 @@ class autoAdjust printer::inst()->print_msg(L0, "Autoconf FAILED: %s. Create config for a single thread.", err.what()); } + if(useCryptonight_gpu) + conf += "*/\n"; + configTpl.replace("CPUCONFIG",conf); configTpl.write(params::inst().configFileCPU); printer::inst()->print_msg(L0, "CPU configuration stored in file '%s'", params::inst().configFileCPU.c_str()); @@ -93,8 +107,8 @@ class autoAdjust } private: - size_t hashMemSize; - size_t halfHashMemSize; + size_t hashMemSize = 0; + size_t halfHashMemSize = 0; std::vector results; diff --git a/xmrstak/backend/cpu/crypto/cn_gpu.hpp b/xmrstak/backend/cpu/crypto/cn_gpu.hpp new file mode 100644 index 000000000..5844d3814 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/cn_gpu.hpp @@ -0,0 +1,42 @@ +#pragma once + +#include "xmrstak/backend/cryptonight.hpp" +#include + +#if defined(_WIN32) || defined(_WIN64) +#include +#include +#define HAS_WIN_INTRIN_API +#endif + +#ifdef __GNUC__ +#include +#if !defined(HAS_WIN_INTRIN_API) +#include +#endif // !defined(HAS_WIN_INTRIN_API) +#endif // __GNUC__ + +inline void cngpu_cpuid(uint32_t eax, int32_t ecx, int32_t val[4]) +{ + val[0] = 0; + val[1] = 0; + val[2] = 0; + val[3] = 0; + +#if defined(HAS_WIN_INTRIN_API) + __cpuidex(val, eax, ecx); +#else + __cpuid_count(eax, ecx, val[0], val[1], val[2], val[3]); +#endif +} + +inline bool cngpu_check_avx2() +{ + int32_t cpu_info[4]; + cngpu_cpuid(7, 0, cpu_info); + return (cpu_info[1] & (1 << 5)) != 0; +} + +void cn_gpu_inner_avx(const uint8_t* spad, uint8_t* lpad, const xmrstak_algo& algo); + +void 
cn_gpu_inner_ssse3(const uint8_t* spad, uint8_t* lpad, const xmrstak_algo& algo); diff --git a/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp b/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp new file mode 100644 index 000000000..8b4aefe13 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/cn_gpu_avx.cpp @@ -0,0 +1,177 @@ +#include "cn_gpu.hpp" +#include "../../cryptonight.hpp" + +#pragma GCC target ("avx2") + +inline void prep_dv_avx(__m256i* idx, __m256i& v, __m256& n01) +{ + v = _mm256_load_si256(idx); + n01 = _mm256_cvtepi32_ps(v); +} + +inline __m256 fma_break(const __m256& x) +{ + // Break the dependency chain by setitng the exp to ?????01 + __m256 xx = _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0xFEFFFFFF)), x); + return _mm256_or_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x00800000)), xx); +} + +// 14 +inline void sub_round(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3, const __m256& rnd_c, __m256& n, __m256& d, __m256& c) +{ + __m256 nn = _mm256_mul_ps(n0, c); + nn = _mm256_mul_ps(_mm256_add_ps(n1, c), _mm256_mul_ps(nn, nn)); + nn = fma_break(nn); + n = _mm256_add_ps(n, nn); + + __m256 dd = _mm256_mul_ps(n2, c); + dd = _mm256_mul_ps(_mm256_sub_ps(n3, c), _mm256_mul_ps(dd, dd)); + dd = fma_break(dd); + d = _mm256_add_ps(d, dd); + + //Constant feedback + c = _mm256_add_ps(c, rnd_c); + c = _mm256_add_ps(c, _mm256_set1_ps(0.734375f)); + __m256 r = _mm256_add_ps(nn, dd); + r = _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x807FFFFF)), r); + r = _mm256_or_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x40000000)), r); + c = _mm256_add_ps(c, r); +} + +// 14*8 + 2 = 112 +inline void round_compute(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3, const __m256& rnd_c, __m256& c, __m256& r) +{ + __m256 n = _mm256_setzero_ps(), d = _mm256_setzero_ps(); + + sub_round(n0, n1, n2, n3, rnd_c, n, d, c); + sub_round(n1, n2, n3, n0, rnd_c, n, d, c); + sub_round(n2, n3, n0, n1, rnd_c, n, d, c); + sub_round(n3, n0, n1, n2, rnd_c, n, d, 
c); + sub_round(n3, n2, n1, n0, rnd_c, n, d, c); + sub_round(n2, n1, n0, n3, rnd_c, n, d, c); + sub_round(n1, n0, n3, n2, rnd_c, n, d, c); + sub_round(n0, n3, n2, n1, rnd_c, n, d, c); + + // Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0 + d = _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0xFF7FFFFF)), d); + d = _mm256_or_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x40000000)), d); + r = _mm256_add_ps(r, _mm256_div_ps(n, d)); +} + +// 112×4 = 448 +template +inline __m256i double_comupte(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3, + float lcnt, float hcnt, const __m256& rnd_c, __m256& sum) +{ + __m256 c = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_set1_ps(lcnt)), _mm_set1_ps(hcnt), 1); + __m256 r = _mm256_setzero_ps(); + + round_compute(n0, n1, n2, n3, rnd_c, c, r); + round_compute(n0, n1, n2, n3, rnd_c, c, r); + round_compute(n0, n1, n2, n3, rnd_c, c, r); + round_compute(n0, n1, n2, n3, rnd_c, c, r); + + // do a quick fmod by setting exp to 2 + r = _mm256_and_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x807FFFFF)), r); + r = _mm256_or_ps(_mm256_castsi256_ps(_mm256_set1_epi32(0x40000000)), r); + + if(add) + sum = _mm256_add_ps(sum, r); + else + sum = r; + + r = _mm256_mul_ps(r, _mm256_set1_ps(536870880.0f)); // 35 + return _mm256_cvttps_epi32(r); +} + +template +inline void double_comupte_wrap(const __m256& n0, const __m256& n1, const __m256& n2, const __m256& n3, + float lcnt, float hcnt, const __m256& rnd_c, __m256& sum, __m256i& out) +{ + __m256i r = double_comupte(n0, n1, n2, n3, lcnt, hcnt, rnd_c, sum); + if(rot != 0) + r = _mm256_or_si256(_mm256_bslli_epi128(r, 16 - rot), _mm256_bsrli_epi128(r, rot)); + + out = _mm256_xor_si256(out, r); +} + + +inline __m256i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n, const uint32_t mask) { return reinterpret_cast<__m256i*>(lpad + (idx & mask) + n*16); } + + +void cn_gpu_inner_avx(const uint8_t* spad, uint8_t* lpad, const 
xmrstak_algo& algo) +{ + const uint32_t ITER = algo.Iter(); + const uint32_t mask = algo.Mask(); + + uint32_t s = reinterpret_cast(spad)[0] >> 8; + __m256i* idx0 = scratchpad_ptr(lpad, s, 0, mask); + __m256i* idx2 = scratchpad_ptr(lpad, s, 2, mask); + __m256 sum0 = _mm256_setzero_ps(); + + for(size_t i = 0; i < ITER; i++) + { + __m256i v01, v23; + __m256 suma, sumb, sum1; + __m256 rc = sum0; + + __m256 n01, n23; + __m256 d01, d23; + prep_dv_avx(idx0, v01, n01); + prep_dv_avx(idx2, v23, n23); + + __m256i out, out2; + __m256 n10, n22, n33; + n10 = _mm256_permute2f128_ps(n01, n01, 0x01); + n22 = _mm256_permute2f128_ps(n23, n23, 0x00); + n33 = _mm256_permute2f128_ps(n23, n23, 0x11); + + out = _mm256_setzero_si256(); + double_comupte_wrap<0>(n01, n10, n22, n33, 1.3437500f, 1.4296875f, rc, suma, out); + double_comupte_wrap<1>(n01, n22, n33, n10, 1.2812500f, 1.3984375f, rc, suma, out); + double_comupte_wrap<2>(n01, n33, n10, n22, 1.3593750f, 1.3828125f, rc, sumb, out); + double_comupte_wrap<3>(n01, n33, n22, n10, 1.3671875f, 1.3046875f, rc, sumb, out); + _mm256_store_si256(idx0, _mm256_xor_si256(v01, out)); + sum0 = _mm256_add_ps(suma, sumb); + out2 = out; + + __m256 n11, n02, n30; + n11 = _mm256_permute2f128_ps(n01, n01, 0x11); + n02 = _mm256_permute2f128_ps(n01, n23, 0x20); + n30 = _mm256_permute2f128_ps(n01, n23, 0x03); + + out = _mm256_setzero_si256(); + double_comupte_wrap<0>(n23, n11, n02, n30, 1.4140625f, 1.3203125f, rc, suma, out); + double_comupte_wrap<1>(n23, n02, n30, n11, 1.2734375f, 1.3515625f, rc, suma, out); + double_comupte_wrap<2>(n23, n30, n11, n02, 1.2578125f, 1.3359375f, rc, sumb, out); + double_comupte_wrap<3>(n23, n30, n02, n11, 1.2890625f, 1.4609375f, rc, sumb, out); + _mm256_store_si256(idx2, _mm256_xor_si256(v23, out)); + sum1 = _mm256_add_ps(suma, sumb); + + out2 = _mm256_xor_si256(out2, out); + out2 = _mm256_xor_si256(_mm256_permute2x128_si256(out2,out2,0x41), out2); + suma = _mm256_permute2f128_ps(sum0, sum1, 0x30); + sumb = 
_mm256_permute2f128_ps(sum0, sum1, 0x21); + sum0 = _mm256_add_ps(suma, sumb); + sum0 = _mm256_add_ps(sum0, _mm256_permute2f128_ps(sum0, sum0, 0x41)); + + // Clear the high 128 bits + __m128 sum = _mm256_castps256_ps128(sum0); + + sum = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)), sum); // take abs(va) by masking the float sign bit + // vs range 0 - 64 + __m128i v0 = _mm_cvttps_epi32(_mm_mul_ps(sum, _mm_set1_ps(16777216.0f))); + v0 = _mm_xor_si128(v0, _mm256_castsi256_si128(out2)); + __m128i v1 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(0, 1, 2, 3)); + v0 = _mm_xor_si128(v0, v1); + v1 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(0, 1, 0, 1)); + v0 = _mm_xor_si128(v0, v1); + + // vs is now between 0 and 1 + sum = _mm_div_ps(sum, _mm_set1_ps(64.0f)); + sum0 = _mm256_insertf128_ps(_mm256_castps128_ps256(sum), sum, 1); + uint32_t n = _mm_cvtsi128_si32(v0); + idx0 = scratchpad_ptr(lpad, n, 0, mask); + idx2 = scratchpad_ptr(lpad, n, 2, mask); + } +} diff --git a/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp b/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp new file mode 100644 index 000000000..c8627d8b8 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/cn_gpu_ssse3.cpp @@ -0,0 +1,181 @@ +#include "cn_gpu.hpp" +#include "../../cryptonight.hpp" + +#pragma GCC target ("sse2") + +inline void prep_dv(__m128i* idx, __m128i& v, __m128& n) +{ + v = _mm_load_si128(idx); + n = _mm_cvtepi32_ps(v); +} + +inline __m128 fma_break(__m128 x) +{ + // Break the dependency chain by setitng the exp to ?????01 + x = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0xFEFFFFFF)), x); + return _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x00800000)), x); +} + +// 14 +inline void sub_round(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c, __m128& n, __m128& d, __m128& c) +{ + n1 = _mm_add_ps(n1, c); + __m128 nn = _mm_mul_ps(n0, c); + nn = _mm_mul_ps(n1, _mm_mul_ps(nn,nn)); + nn = fma_break(nn); + n = _mm_add_ps(n, nn); + + n3 = _mm_sub_ps(n3, c); + __m128 dd = _mm_mul_ps(n2, c); + dd = _mm_mul_ps(n3, 
_mm_mul_ps(dd,dd)); + dd = fma_break(dd); + d = _mm_add_ps(d, dd); + + //Constant feedback + c = _mm_add_ps(c, rnd_c); + c = _mm_add_ps(c, _mm_set1_ps(0.734375f)); + __m128 r = _mm_add_ps(nn, dd); + r = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x807FFFFF)), r); + r = _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x40000000)), r); + c = _mm_add_ps(c, r); +} + +// 14*8 + 2 = 112 +inline void round_compute(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c, __m128& c, __m128& r) +{ + __m128 n = _mm_setzero_ps(), d = _mm_setzero_ps(); + + sub_round(n0, n1, n2, n3, rnd_c, n, d, c); + sub_round(n1, n2, n3, n0, rnd_c, n, d, c); + sub_round(n2, n3, n0, n1, rnd_c, n, d, c); + sub_round(n3, n0, n1, n2, rnd_c, n, d, c); + sub_round(n3, n2, n1, n0, rnd_c, n, d, c); + sub_round(n2, n1, n0, n3, rnd_c, n, d, c); + sub_round(n1, n0, n3, n2, rnd_c, n, d, c); + sub_round(n0, n3, n2, n1, rnd_c, n, d, c); + + // Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0 + d = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0xFF7FFFFF)), d); + d = _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x40000000)), d); + r =_mm_add_ps(r, _mm_div_ps(n,d)); +} + +// 112×4 = 448 +template +inline __m128i single_comupte(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum) +{ + __m128 c = _mm_set1_ps(cnt); + __m128 r = _mm_setzero_ps(); + + round_compute(n0, n1, n2, n3, rnd_c, c, r); + round_compute(n0, n1, n2, n3, rnd_c, c, r); + round_compute(n0, n1, n2, n3, rnd_c, c, r); + round_compute(n0, n1, n2, n3, rnd_c, c, r); + + // do a quick fmod by setting exp to 2 + r = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x807FFFFF)), r); + r = _mm_or_ps(_mm_castsi128_ps(_mm_set1_epi32(0x40000000)), r); + + if(add) + sum = _mm_add_ps(sum, r); + else + sum = r; + + r = _mm_mul_ps(r, _mm_set1_ps(536870880.0f)); // 35 + return _mm_cvttps_epi32(r); +} + +template +inline void single_comupte_wrap(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, 
__m128 rnd_c, __m128& sum, __m128i& out) +{ + __m128i r = single_comupte(n0, n1, n2, n3, cnt, rnd_c, sum); + if(rot != 0) + r = _mm_or_si128(_mm_slli_si128(r, 16 - rot), _mm_srli_si128(r, rot)); + out = _mm_xor_si128(out, r); +} + +inline __m128i* scratchpad_ptr(uint8_t* lpad, uint32_t idx, size_t n, const uint32_t mask) { return reinterpret_cast<__m128i*>(lpad + (idx & mask) + n*16); } + +void cn_gpu_inner_ssse3(const uint8_t* spad, uint8_t* lpad, const xmrstak_algo& algo) +{ + const uint32_t ITER = algo.Iter(); + const uint32_t mask = algo.Mask(); + + uint32_t s = reinterpret_cast(spad)[0] >> 8; + __m128i* idx0 = scratchpad_ptr(lpad, s, 0, mask); + __m128i* idx1 = scratchpad_ptr(lpad, s, 1, mask); + __m128i* idx2 = scratchpad_ptr(lpad, s, 2, mask); + __m128i* idx3 = scratchpad_ptr(lpad, s, 3, mask); + __m128 sum0 = _mm_setzero_ps(); + + for(size_t i = 0; i < ITER; i++) + { + __m128 n0, n1, n2, n3; + __m128i v0, v1, v2, v3; + __m128 suma, sumb, sum1, sum2, sum3; + + prep_dv(idx0, v0, n0); + prep_dv(idx1, v1, n1); + prep_dv(idx2, v2, n2); + prep_dv(idx3, v3, n3); + __m128 rc = sum0; + + __m128i out, out2; + out = _mm_setzero_si128(); + single_comupte_wrap<0>(n0, n1, n2, n3, 1.3437500f, rc, suma, out); + single_comupte_wrap<1>(n0, n2, n3, n1, 1.2812500f, rc, suma, out); + single_comupte_wrap<2>(n0, n3, n1, n2, 1.3593750f, rc, sumb, out); + single_comupte_wrap<3>(n0, n3, n2, n1, 1.3671875f, rc, sumb, out); + sum0 = _mm_add_ps(suma, sumb); + _mm_store_si128(idx0, _mm_xor_si128(v0, out)); + out2 = out; + + out = _mm_setzero_si128(); + single_comupte_wrap<0>(n1, n0, n2, n3, 1.4296875f, rc, suma, out); + single_comupte_wrap<1>(n1, n2, n3, n0, 1.3984375f, rc, suma, out); + single_comupte_wrap<2>(n1, n3, n0, n2, 1.3828125f, rc, sumb, out); + single_comupte_wrap<3>(n1, n3, n2, n0, 1.3046875f, rc, sumb, out); + sum1 = _mm_add_ps(suma, sumb); + _mm_store_si128(idx1, _mm_xor_si128(v1, out)); + out2 = _mm_xor_si128(out2, out); + + out = _mm_setzero_si128(); + 
single_comupte_wrap<0>(n2, n1, n0, n3, 1.4140625f, rc, suma, out); + single_comupte_wrap<1>(n2, n0, n3, n1, 1.2734375f, rc, suma, out); + single_comupte_wrap<2>(n2, n3, n1, n0, 1.2578125f, rc, sumb, out); + single_comupte_wrap<3>(n2, n3, n0, n1, 1.2890625f, rc, sumb, out); + sum2 = _mm_add_ps(suma, sumb); + _mm_store_si128(idx2, _mm_xor_si128(v2, out)); + out2 = _mm_xor_si128(out2, out); + + out = _mm_setzero_si128(); + single_comupte_wrap<0>(n3, n1, n2, n0, 1.3203125f, rc, suma, out); + single_comupte_wrap<1>(n3, n2, n0, n1, 1.3515625f, rc, suma, out); + single_comupte_wrap<2>(n3, n0, n1, n2, 1.3359375f, rc, sumb, out); + single_comupte_wrap<3>(n3, n0, n2, n1, 1.4609375f, rc, sumb, out); + sum3 = _mm_add_ps(suma, sumb); + _mm_store_si128(idx3, _mm_xor_si128(v3, out)); + out2 = _mm_xor_si128(out2, out); + sum0 = _mm_add_ps(sum0, sum1); + sum2 = _mm_add_ps(sum2, sum3); + sum0 = _mm_add_ps(sum0, sum2); + + sum0 = _mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)), sum0); // take abs(va) by masking the float sign bit + // vs range 0 - 64 + n0 = _mm_mul_ps(sum0, _mm_set1_ps(16777216.0f)); + v0 = _mm_cvttps_epi32(n0); + v0 = _mm_xor_si128(v0, out2); + v1 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(0, 1, 2, 3)); + v0 = _mm_xor_si128(v0, v1); + v1 = _mm_shuffle_epi32(v0, _MM_SHUFFLE(0, 1, 0, 1)); + v0 = _mm_xor_si128(v0, v1); + + // vs is now between 0 and 1 + sum0 = _mm_div_ps(sum0, _mm_set1_ps(64.0f)); + uint32_t n = _mm_cvtsi128_si32(v0); + idx0 = scratchpad_ptr(lpad, n, 0, mask); + idx1 = scratchpad_ptr(lpad, n, 1, mask); + idx2 = scratchpad_ptr(lpad, n, 2, mask); + idx3 = scratchpad_ptr(lpad, n, 3, mask); + } +} diff --git a/xmrstak/backend/cpu/crypto/cryptonight.h b/xmrstak/backend/cpu/crypto/cryptonight.h index 5c9a73332..a7c77cdac 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight.h +++ b/xmrstak/backend/cpu/crypto/cryptonight.h @@ -1,29 +1,31 @@ -#ifndef __CRYPTONIGHT_H_INCLUDED -#define __CRYPTONIGHT_H_INCLUDED - -#ifdef __cplusplus -extern "C" { -#endif - 
+#pragma once #include #include -typedef struct { +#include "variant4_random_math.h" + +struct extra_ctx_r +{ + uint64_t height = 0; + // the buffer must be able to hold NUM_INSTRUCTIONS_MAX and a termination instruction + V4_Instruction code[NUM_INSTRUCTIONS_MAX + 1]; +}; + +struct cryptonight_ctx +{ uint8_t hash_state[224]; // Need only 200, explicit align uint8_t* long_state; uint8_t ctx_info[24]; //Use some of the extra memory for flags -} cryptonight_ctx; + extra_ctx_r cn_r_ctx; +}; -typedef struct { +struct alloc_msg +{ const char* warning; -} alloc_msg; +}; size_t cryptonight_init(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg); cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg); void cryptonight_free_ctx(cryptonight_ctx* ctx); -#ifdef __cplusplus -} -#endif -#endif diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index 06cbe8740..43f719873 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -17,6 +17,8 @@ #include "cryptonight.h" #include "xmrstak/backend/cryptonight.hpp" +#include "../../miner_work.hpp" +#include "cn_gpu.hpp" #include #include #include @@ -164,9 +166,11 @@ inline void mix_and_propagate(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3 x7 = _mm_xor_si128(x7, tmp0); } -template -void cn_explode_scratchpad(const __m128i* input, __m128i* output) +template +void cn_explode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_algo& algo) { + constexpr bool HEAVY_MIX = ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast; + // This is more than we have registers, compiler will assign 2 keys on the stack __m128i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7; __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; @@ -182,7 +186,7 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output) xin6 = 
_mm_load_si128(input + 10); xin7 = _mm_load_si128(input + 11); - if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + if(HEAVY_MIX) { for(size_t i=0; i < 16; i++) { @@ -216,6 +220,7 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output) } } + const size_t MEM = algo.Mem(); for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) { if(SOFT_AES) @@ -263,9 +268,46 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output) } } -template -void cn_implode_scratchpad(const __m128i* input, __m128i* output) +template +void cn_explode_scratchpad_gpu(const uint8_t* input, uint8_t* output, const xmrstak_algo& algo) +{ + constexpr size_t hash_size = 200; // 25x8 bytes + alignas(128) uint64_t hash[25]; + const size_t mem = algo.Mem(); + + for (uint64_t i = 0; i < mem / 512; i++) + { + memcpy(hash, input, hash_size); + hash[0] ^= i; + + keccakf(hash, 24); + memcpy(output, hash, 160); + output+=160; + + keccakf(hash, 24); + memcpy(output, hash, 176); + output+=176; + + keccakf(hash, 24); + memcpy(output, hash, 176); + output+=176; + + if(PREFETCH) + { + _mm_prefetch((const char*)output - 512, _MM_HINT_T2); + _mm_prefetch((const char*)output - 384, _MM_HINT_T2); + _mm_prefetch((const char*)output - 256, _MM_HINT_T2); + _mm_prefetch((const char*)output - 128, _MM_HINT_T2); + } + } +} + +template +void cn_implode_scratchpad(const __m128i* input, __m128i* output, const xmrstak_algo& algo) { + constexpr bool HEAVY_MIX = ALGO == cryptonight_heavy || ALGO == cryptonight_haven || + ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast || ALGO == cryptonight_gpu; + // This is more than we have registers, compiler will assign 2 keys on the stack __m128i xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7; __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; @@ -281,6 +323,7 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output) xout6 = _mm_load_si128(output + 10); xout7 = 
_mm_load_si128(output + 11); + const size_t MEM = algo.Mem(); for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) { if(PREFETCH) @@ -326,11 +369,11 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output) aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); } - if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + if(HEAVY_MIX) mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7); } - if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + if(HEAVY_MIX) { for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) { @@ -377,7 +420,7 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output) aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); } - if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + if(HEAVY_MIX) mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7); } @@ -465,7 +508,7 @@ inline __m128i aes_round_bittube2(const __m128i& val, const __m128i& key) return _mm_load_si128((__m128i*)k); } -template +template inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) { mem_out[0] = _mm_cvtsi128_si64(tmp); @@ -543,9 +586,39 @@ inline void set_float_rounding_mode() #endif } -#define CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1) \ +inline void set_float_rounding_mode_nearest() +{ +#ifdef _MSC_VER + _control87(RC_NEAR, MCW_RC); +#else + std::fesetround(FE_TONEAREST); +#endif +} + +inline __m128 _mm_set1_ps_epi32(uint32_t x) +{ + return _mm_castsi128_ps(_mm_set1_epi32(x)); +} + +inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var) +{ + __m128 r = _mm_cvtepi32_ps(cx); + __m128 c_old = conc_var; + r = _mm_add_ps(r, conc_var); + r = _mm_mul_ps(r, _mm_mul_ps(r, r)); + r = _mm_and_ps(_mm_set1_ps_epi32(0x807FFFFF), 
r); + r = _mm_or_ps(_mm_set1_ps_epi32(0x40000000), r); + conc_var = _mm_add_ps(conc_var, r); + + c_old = _mm_and_ps(_mm_set1_ps_epi32(0x807FFFFF), c_old); + c_old = _mm_or_ps(_mm_set1_ps_epi32(0x40000000), c_old); + __m128 nc = _mm_mul_ps(c_old, _mm_set1_ps(536870880.0f)); + cx = _mm_xor_si128(cx, _mm_cvttps_epi32(nc)); +} + +#define CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1, cx) \ /* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \ - if(ALGO == cryptonight_monero_v8) \ + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_r || ALGO == cryptonight_r_wow) \ { \ const uint64_t idx1 = idx0 & MASK; \ const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]); \ @@ -554,11 +627,13 @@ inline void set_float_rounding_mode() _mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \ _mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \ _mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ + if (ALGO == cryptonight_r) \ + cx = _mm_xor_si128(_mm_xor_si128(cx, chunk3), _mm_xor_si128(chunk1, chunk2)); \ } #define CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi) \ /* Shuffle the other 3x16 byte chunks in the current 64-byte cache line */ \ - if(ALGO == cryptonight_monero_v8) \ + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_r_wow) \ { \ const uint64_t idx1 = idx0 & MASK; \ const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi)); \ @@ -595,6 +670,23 @@ inline void set_float_rounding_mode() assign(sqrt_result, int_sqrt33_1_double_precision(cx_64 + division_result)); \ } +#define CN_R_RANDOM_MATH(n, al, ah, cl, bx0, bx1, cn_r_data) \ + if (ALGO == cryptonight_r || ALGO == cryptonight_r_wow) \ + { \ + cl ^= (cn_r_data[0] + cn_r_data[1]) | ((uint64_t)(cn_r_data[2] + cn_r_data[3]) << 32); \ + cn_r_data[4] = static_cast(al); \ + cn_r_data[5] = static_cast(ah); \ + cn_r_data[6] = 
static_cast(_mm_cvtsi128_si32(bx0)); \ + cn_r_data[7] = static_cast(_mm_cvtsi128_si32(bx1)); \ + cn_r_data[8] = static_cast(_mm_cvtsi128_si32(_mm_srli_si128(bx1, 8))); \ + v4_random_math(ctx[n]->cn_r_ctx.code, cn_r_data); \ + } \ + if (ALGO == cryptonight_r) \ + { \ + al ^= cn_r_data[2] | ((uint64_t)(cn_r_data[3]) << 32); \ + ah ^= cn_r_data[0] | ((uint64_t)(cn_r_data[1]) << 32); \ + } + #define CN_INIT_SINGLE \ if((ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) && len < 43) \ { \ @@ -602,7 +694,7 @@ inline void set_float_rounding_mode() return; \ } -#define CN_INIT(n, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm) \ +#define CN_INIT(n, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm, cn_r_data) \ keccak((const uint8_t *)input + len * n, len, ctx[n]->hash_state, 200); \ uint64_t monero_const; \ if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ @@ -611,7 +703,7 @@ inline void set_float_rounding_mode() monero_const ^= *(reinterpret_cast(ctx[n]->hash_state) + 24); \ } \ /* Optim - 99% time boundary */ \ - cn_explode_scratchpad((__m128i*)ctx[n]->hash_state, (__m128i*)ctx[n]->long_state); \ + cn_explode_scratchpad((__m128i*)ctx[n]->hash_state, (__m128i*)ctx[n]->long_state, algo); \ \ __m128i ax0; \ uint64_t idx0; \ @@ -620,7 +712,14 @@ inline void set_float_rounding_mode() /* BEGIN cryptonight_monero_v8 variables */ \ __m128i bx1; \ __m128i division_result_xmm; \ + __m128 conc_var; \ + if(ALGO == cryptonight_conceal || ALGO == cryptonight_gpu) \ + {\ + set_float_rounding_mode_nearest(); \ + conc_var = _mm_setzero_ps(); \ + }\ GetOptimalSqrtType_t sqrt_result; \ + uint32_t cn_r_data[9]; \ /* END cryptonight_monero_v8 variables */ \ { \ 
uint64_t* h0 = (uint64_t*)ctx[n]->hash_state; \ @@ -634,13 +733,23 @@ inline void set_float_rounding_mode() assign(sqrt_result, h0[13]); \ set_float_rounding_mode(); \ } \ + if (ALGO == cryptonight_r || ALGO == cryptonight_r_wow) \ + { \ + bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \ + cn_r_data[0] = (uint32_t)(h0[12]); \ + cn_r_data[1] = (uint32_t)(h0[12] >> 32); \ + cn_r_data[2] = (uint32_t)(h0[13]); \ + cn_r_data[3] = (uint32_t)(h0[13] >> 32); \ + } \ } \ __m128i *ptr0 -#define CN_STEP1(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1) \ +#define CN_STEP1(n, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, cx, bx1) \ __m128i cx; \ ptr0 = (__m128i *)&l0[idx0 & MASK]; \ cx = _mm_load_si128(ptr0); \ + if (ALGO == cryptonight_conceal) \ + cryptonight_conceal_tweak(cx, conc_var); \ if (ALGO == cryptonight_bittube2) \ { \ cx = aes_round_bittube2(cx, ax0); \ @@ -652,7 +761,7 @@ inline void set_float_rounding_mode() else \ cx = _mm_aesenc_si128(cx, ax0); \ } \ - CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1) + CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1, cx) #define CN_STEP2(n, monero_const, l0, ax0, bx0, idx0, ptr0, cx) \ if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) \ @@ -664,24 +773,32 @@ inline void set_float_rounding_mode() ptr0 = (__m128i *)&l0[idx0 & MASK]; \ if(PREFETCH) \ _mm_prefetch((const char*)ptr0, _MM_HINT_T0); \ - if(ALGO != cryptonight_monero_v8) \ + if(ALGO != cryptonight_monero_v8 && ALGO != cryptonight_r && ALGO != cryptonight_r_wow) \ bx0 = cx -#define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm) \ +#define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm, cn_r_data) \ uint64_t lo, cl, ch; \ uint64_t al0 = _mm_cvtsi128_si64(ax0); \ uint64_t ah0 = 
((uint64_t*)&ax0)[1]; \ cl = ((uint64_t*)ptr0)[0]; \ ch = ((uint64_t*)ptr0)[1]; \ + CN_R_RANDOM_MATH(n, al0, ah0, cl, bx0, bx1, cn_r_data); \ CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl); \ { \ uint64_t hi; \ lo = _umul128(idx0, cl, &hi); \ - CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi); \ + if(ALGO == cryptonight_r) \ + { \ + CN_MONERO_V8_SHUFFLE_0(n, l0, idx0, ax0, bx0, bx1, cx); \ + } \ + else \ + { \ + CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi); \ + } \ ah0 += lo; \ al0 += hi; \ } \ - if(ALGO == cryptonight_monero_v8) \ + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_r || ALGO != cryptonight_r_wow) \ { \ bx1 = bx0; \ bx0 = cx; \ @@ -729,7 +846,7 @@ inline void set_float_rounding_mode() #define CN_FINALIZE(n) \ /* Optim - 90% time boundary */ \ - cn_implode_scratchpad((__m128i*)ctx[n]->long_state, (__m128i*)ctx[n]->hash_state); \ + cn_implode_scratchpad((__m128i*)ctx[n]->long_state, (__m128i*)ctx[n]->hash_state, algo); \ /* Optim - 99% time boundary */ \ keccakf((uint64_t*)ctx[n]->hash_state, 24); \ extra_hashes[ctx[n]->hash_state[0] & 3](ctx[n]->hash_state, 200, (char*)output + 32 * n) @@ -771,6 +888,7 @@ inline void set_float_rounding_mode() #define CN_ENUM_13(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n #define CN_ENUM_14(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n #define CN_ENUM_15(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) n, x1 ## n, x2 ## n, x3 ## n, x4 ## n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n, x15 ## n +#define CN_ENUM_16(n, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16) n, x1 ## n, x2 ## n, x3 ## n, x4 ## 
n, x5 ## n, x6 ## n, x7 ## n, x8 ## n, x9 ## n, x10 ## n, x11 ## n, x12 ## n, x13 ## n, x14 ## n, x15 ## n, x16 ## n /** repeat a macro call multiple times * @@ -798,22 +916,22 @@ struct Cryptonight_hash<1> { static constexpr size_t N = 1; - template - static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) { - constexpr size_t MASK = cn_select_mask(); - constexpr size_t ITERATIONS = cn_select_iter(); - constexpr size_t MEM = cn_select_memory(); + const uint32_t MASK = algo.Mask(); + const uint32_t ITERATIONS = algo.Iter(); + const size_t MEM = algo.Mem(); CN_INIT_SINGLE; - REPEAT_1(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm); + REPEAT_1(11, CN_INIT, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm, cn_r_data); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { - REPEAT_1(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); + REPEAT_1(9, CN_STEP1, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_1(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_1(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm); + REPEAT_1(16, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm, cn_r_data); REPEAT_1(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_1(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -827,22 +945,22 @@ struct Cryptonight_hash<2> { static constexpr size_t N = 2; - template - static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) { - constexpr 
size_t MASK = cn_select_mask(); - constexpr size_t ITERATIONS = cn_select_iter(); - constexpr size_t MEM = cn_select_memory(); + const uint32_t MASK = algo.Mask(); + const uint32_t ITERATIONS = algo.Iter(); + const size_t MEM = algo.Mem(); CN_INIT_SINGLE; - REPEAT_2(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm); + REPEAT_2(11, CN_INIT, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm, cn_r_data); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { - REPEAT_2(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); + REPEAT_2(9, CN_STEP1, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_2(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_2(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm); + REPEAT_2(16, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm, cn_r_data); REPEAT_2(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_2(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -856,22 +974,22 @@ struct Cryptonight_hash<3> { static constexpr size_t N = 3; - template - static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) { - constexpr size_t MASK = cn_select_mask(); - constexpr size_t ITERATIONS = cn_select_iter(); - constexpr size_t MEM = cn_select_memory(); + const uint32_t MASK = algo.Mask(); + const uint32_t ITERATIONS = algo.Iter(); + const size_t MEM = algo.Mem(); CN_INIT_SINGLE; - REPEAT_3(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm); + REPEAT_3(11, CN_INIT, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm, 
cn_r_data); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { - REPEAT_3(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); + REPEAT_3(9, CN_STEP1, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_3(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_3(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm); + REPEAT_3(16, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm, cn_r_data); REPEAT_3(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_3(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -885,22 +1003,22 @@ struct Cryptonight_hash<4> { static constexpr size_t N = 4; - template - static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) { - constexpr size_t MASK = cn_select_mask(); - constexpr size_t ITERATIONS = cn_select_iter(); - constexpr size_t MEM = cn_select_memory(); + const uint32_t MASK = algo.Mask(); + const uint32_t ITERATIONS = algo.Iter(); + const size_t MEM = algo.Mem(); CN_INIT_SINGLE; - REPEAT_4(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm); + REPEAT_4(11, CN_INIT, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm, cn_r_data); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { - REPEAT_4(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); + REPEAT_4(9, CN_STEP1, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_4(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_4(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm); + REPEAT_4(16, CN_STEP3, monero_const, 
l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm, cn_r_data); REPEAT_4(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_4(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -914,22 +1032,22 @@ struct Cryptonight_hash<5> { static constexpr size_t N = 5; - template - static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) { - constexpr size_t MASK = cn_select_mask(); - constexpr size_t ITERATIONS = cn_select_iter(); - constexpr size_t MEM = cn_select_memory(); + const uint32_t MASK = algo.Mask(); + const uint32_t ITERATIONS = algo.Iter(); + const size_t MEM = algo.Mem(); CN_INIT_SINGLE; - REPEAT_5(9, CN_INIT, monero_const, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm); + REPEAT_5(11, CN_INIT, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, bx1, sqrt_result, division_result_xmm, cn_r_data); // Optim - 90% time boundary for(size_t i = 0; i < ITERATIONS; i++) { - REPEAT_5(8, CN_STEP1, monero_const, l0, ax0, bx0, idx0, ptr0, cx, bx1); + REPEAT_5(9, CN_STEP1, monero_const, conc_var, l0, ax0, bx0, idx0, ptr0, cx, bx1); REPEAT_5(7, CN_STEP2, monero_const, l0, ax0, bx0, idx0, ptr0, cx); - REPEAT_5(15, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm); + REPEAT_5(16, CN_STEP3, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm, cn_r_data); REPEAT_5(11, CN_STEP4, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0); REPEAT_5(6, CN_STEP5, monero_const, l0, ax0, bx0, idx0, ptr0); } @@ -951,20 +1069,18 @@ struct Cryptonight_hash_asm<1, asm_version> { static constexpr size_t N = 1; - template - static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) + template + static void 
hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) { - constexpr size_t MEM = cn_select_memory(); - keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200); - cn_explode_scratchpad((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state); + cn_explode_scratchpad((__m128i*)ctx[0]->hash_state, (__m128i*)ctx[0]->long_state, algo); if(asm_version == 0) cryptonight_v8_mainloop_ivybridge_asm(ctx[0]); else if(asm_version == 1) cryptonight_v8_mainloop_ryzen_asm(ctx[0]); - cn_implode_scratchpad((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state); + cn_implode_scratchpad((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state, algo); keccakf((uint64_t*)ctx[0]->hash_state, 24); extra_hashes[ctx[0]->hash_state[0] & 3](ctx[0]->hash_state, 200, (char*)output); } @@ -976,16 +1092,16 @@ struct Cryptonight_hash_asm<2, 0> { static constexpr size_t N = 2; - template - static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx) + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) { - constexpr size_t MEM = cn_select_memory(); + const size_t MEM = algo.Mem(); for(size_t i = 0; i < N; ++i) { keccak((const uint8_t *)input + len * i, len, ctx[i]->hash_state, 200); /* Optim - 99% time boundary */ - cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state); + cn_explode_scratchpad((__m128i*)ctx[i]->hash_state, (__m128i*)ctx[i]->long_state, algo); } cryptonight_v8_double_mainloop_sandybridge_asm(ctx[0], ctx[1]); @@ -993,10 +1109,48 @@ struct Cryptonight_hash_asm<2, 0> for(size_t i = 0; i < N; ++i) { /* Optim - 90% time boundary */ - cn_implode_scratchpad((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state); + cn_implode_scratchpad((__m128i*)ctx[i]->long_state, (__m128i*)ctx[i]->hash_state, algo); /* Optim - 99% time boundary */ keccakf((uint64_t*)ctx[i]->hash_state, 24); 
extra_hashes[ctx[i]->hash_state[0] & 3](ctx[i]->hash_state, 200, (char*)output + 32 * i); } } }; + +struct Cryptonight_hash_gpu +{ + static constexpr size_t N = 1; + + template + static void hash(const void* input, size_t len, void* output, cryptonight_ctx** ctx, const xmrstak_algo& algo) + { + keccak((const uint8_t *)input, len, ctx[0]->hash_state, 200); + cn_explode_scratchpad_gpu(ctx[0]->hash_state, ctx[0]->long_state, algo); + + if(cngpu_check_avx2()) + cn_gpu_inner_avx(ctx[0]->hash_state, ctx[0]->long_state, algo); + else + cn_gpu_inner_ssse3(ctx[0]->hash_state, ctx[0]->long_state, algo); + + cn_implode_scratchpad((__m128i*)ctx[0]->long_state, (__m128i*)ctx[0]->hash_state, algo); + keccakf((uint64_t*)ctx[0]->hash_state, 24); + memcpy(output, ctx[0]->hash_state, 32); + } +}; + +template +struct Cryptonight_R_generator +{ + template + static void cn_on_new_job(const xmrstak::miner_work& work, cryptonight_ctx** ctx) + { + if(ctx[0]->cn_r_ctx.height == work.iBlockHeight) + return; + + ctx[0]->cn_r_ctx.height = work.iBlockHeight; + v4_random_math_init(ctx[0]->cn_r_ctx.code, work.iBlockHeight); + + for(size_t i=1; i < N; i++) + ctx[i]->cn_r_ctx = ctx[0]->cn_r_ctx; + } +}; diff --git a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp index a7e4696a8..a065abe01 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp +++ b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp @@ -203,10 +203,13 @@ size_t cryptonight_init(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg) cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, alloc_msg* msg) { - size_t hashMemSize = std::max( - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()), - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) - ); + auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); + + size_t hashMemSize = 
0; + for(const auto algo : neededAlgorithms) + { + hashMemSize = std::max(hashMemSize, algo.Mem()); + } cryptonight_ctx* ptr = (cryptonight_ctx*)_mm_malloc(sizeof(cryptonight_ctx), 4096); @@ -284,10 +287,13 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al void cryptonight_free_ctx(cryptonight_ctx* ctx) { - size_t hashMemSize = std::max( - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()), - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) - ); + auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); + + size_t hashMemSize = 0; + for(const auto algo : neededAlgorithms) + { + hashMemSize = std::max(hashMemSize, algo.Mem()); + } if(ctx->ctx_info[0] != 0) { diff --git a/xmrstak/backend/cpu/crypto/variant4_random_math.h b/xmrstak/backend/cpu/crypto/variant4_random_math.h new file mode 100644 index 000000000..07dd3cf61 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/variant4_random_math.h @@ -0,0 +1,451 @@ +#pragma once + +#include +#include "../../cryptonight.hpp" + +extern "C" +{ + #include "c_blake256.h" +} + +enum V4_Settings +{ + // Generate code with minimal theoretical latency = 45 cycles, which is equivalent to 15 multiplications + TOTAL_LATENCY = 15 * 3, + + // Always generate at least 60 instructions + NUM_INSTRUCTIONS_MIN = 60, + + // Never generate more than 70 instructions (final RET instruction doesn't count here) + NUM_INSTRUCTIONS_MAX = 70, + + // Available ALUs for MUL + // Modern CPUs typically have only 1 ALU which can do multiplications + ALU_COUNT_MUL = 1, + + // Total available ALUs + // Modern CPUs have 4 ALUs, but we use only 3 because random math executes together with other main loop code + ALU_COUNT = 3, +}; + +enum V4_InstructionList +{ + MUL, // a*b + ADD, // a+b + C, C is an unsigned 32-bit constant + SUB, // a-b + ROR, // rotate right "a" by "b & 31" bits + ROL, // rotate left "a" 
by "b & 31" bits + XOR, // a^b + RET, // finish execution + V4_INSTRUCTION_COUNT = RET, +}; + +// V4_InstructionDefinition is used to generate code from random data +// Every random sequence of bytes is a valid code +// +// There are 9 registers in total: +// - 4 variable registers +// - 5 constant registers initialized from loop variables +// This is why dst_index is 2 bits +enum V4_InstructionDefinition +{ + V4_OPCODE_BITS = 3, + V4_DST_INDEX_BITS = 2, + V4_SRC_INDEX_BITS = 3, +}; + +struct V4_Instruction +{ + uint8_t opcode; + uint8_t dst_index; + uint8_t src_index; + uint32_t C; +}; + +#ifndef FORCEINLINE +#ifdef __GNUC__ +#define FORCEINLINE __attribute__((always_inline)) inline +#elif _MSC_VER +#define FORCEINLINE __forceinline +#else +#define FORCEINLINE inline +#endif +#endif + +#ifndef UNREACHABLE_CODE +#ifdef __GNUC__ +#define UNREACHABLE_CODE __builtin_unreachable() +#elif _MSC_VER +#define UNREACHABLE_CODE __assume(false) +#else +#define UNREACHABLE_CODE +#endif +#endif + +// Random math interpreter's loop is fully unrolled and inlined to achieve 100% branch prediction on CPU: +// every switch-case will point to the same destination on every iteration of Cryptonight main loop +// +// This is about as fast as it can get without using low-level machine code generation +template +static void v4_random_math(const struct V4_Instruction* code, v4_reg* r) +{ + enum + { + REG_BITS = sizeof(v4_reg) * 8, + }; + +#define V4_EXEC(i) \ + { \ + const struct V4_Instruction* op = code + i; \ + const v4_reg src = r[op->src_index]; \ + v4_reg* dst = r + op->dst_index; \ + switch (op->opcode) \ + { \ + case MUL: \ + *dst *= src; \ + break; \ + case ADD: \ + *dst += src + op->C; \ + break; \ + case SUB: \ + *dst -= src; \ + break; \ + case ROR: \ + { \ + const uint32_t shift = src % REG_BITS; \ + *dst = (*dst >> shift) | (*dst << ((REG_BITS - shift) % REG_BITS)); \ + } \ + break; \ + case ROL: \ + { \ + const uint32_t shift = src % REG_BITS; \ + *dst = (*dst << shift) | 
(*dst >> ((REG_BITS - shift) % REG_BITS)); \ + } \ + break; \ + case XOR: \ + *dst ^= src; \ + break; \ + case RET: \ + return; \ + default: \ + UNREACHABLE_CODE; \ + break; \ + } \ + } + +#define V4_EXEC_10(j) \ + V4_EXEC(j + 0) \ + V4_EXEC(j + 1) \ + V4_EXEC(j + 2) \ + V4_EXEC(j + 3) \ + V4_EXEC(j + 4) \ + V4_EXEC(j + 5) \ + V4_EXEC(j + 6) \ + V4_EXEC(j + 7) \ + V4_EXEC(j + 8) \ + V4_EXEC(j + 9) + + // Generated program can have 60 + a few more (usually 2-3) instructions to achieve required latency + // I've checked all block heights < 10,000,000 and here is the distribution of program sizes: + // + // 60 27960 + // 61 105054 + // 62 2452759 + // 63 5115997 + // 64 1022269 + // 65 1109635 + // 66 153145 + // 67 8550 + // 68 4529 + // 69 102 + + // Unroll 70 instructions here + V4_EXEC_10(0); // instructions 0-9 + V4_EXEC_10(10); // instructions 10-19 + V4_EXEC_10(20); // instructions 20-29 + V4_EXEC_10(30); // instructions 30-39 + V4_EXEC_10(40); // instructions 40-49 + V4_EXEC_10(50); // instructions 50-59 + V4_EXEC_10(60); // instructions 60-69 + +#undef V4_EXEC_10 +#undef V4_EXEC +} + +// If we don't have enough data available, generate more +static FORCEINLINE void check_data(size_t* data_index, const size_t bytes_needed, int8_t* data, const size_t data_size) +{ + if (*data_index + bytes_needed > data_size) + { + blake256_hash((uint8_t*)data, (uint8_t*)data, data_size); + *data_index = 0; + } +} + +#define SWAP32LE(x) x +#define SWAP64LE(x) x + +// Generates as many random math operations as possible with given latency and ALU restrictions +// "code" array must have space for NUM_INSTRUCTIONS_MAX+1 instructions +template +static int v4_random_math_init(struct V4_Instruction* code, const uint64_t height) +{ + // MUL is 3 cycles, 3-way addition and rotations are 2 cycles, SUB/XOR are 1 cycle + // These latencies match real-life instruction latencies for Intel CPUs starting from Sandy Bridge and up to Skylake/Coffee lake + // + // AMD Ryzen has the same 
latencies except 1-cycle ROR/ROL, so it'll be a bit faster than Intel Sandy Bridge and newer processors + // Surprisingly, Intel Nehalem also has 1-cycle ROR/ROL, so it'll also be faster than Intel Sandy Bridge and newer processors + // AMD Bulldozer has 4 cycles latency for MUL (slower than Intel) and 1 cycle for ROR/ROL (faster than Intel), so average performance will be the same + // Source: https://www.agner.org/optimize/instruction_tables.pdf + const int op_latency[V4_INSTRUCTION_COUNT] = { 3, 2, 1, 2, 2, 1 }; + + // Instruction latencies for theoretical ASIC implementation + const int asic_op_latency[V4_INSTRUCTION_COUNT] = { 3, 1, 1, 1, 1, 1 }; + + // Available ALUs for each instruction + const int op_ALUs[V4_INSTRUCTION_COUNT] = { ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT }; + + int8_t data[32]; + memset(data, 0, sizeof(data)); + uint64_t tmp = SWAP64LE(height); + memcpy(data, &tmp, sizeof(uint64_t)); + if(ALGO == cryptonight_r) + { + data[20] = -38; + } + + // Set data_index past the last byte in data + // to trigger full data update with blake hash + // before we start using it + size_t data_index = sizeof(data); + + int code_size; + + // There is a small chance (1.8%) that register R8 won't be used in the generated program + // So we keep track of it and try again if it's not used + bool r8_used; + do { + int latency[9]; + int asic_latency[9]; + + // Tracks previous instruction and value of the source operand for registers R0-R3 throughout code execution + // byte 0: current value of the destination register + // byte 1: instruction opcode + // byte 2: current value of the source register + // + // Registers R4-R8 are constant and are treated as having the same value because when we do + // the same operation twice with two constant source registers, it can be optimized into a single operation + uint32_t inst_data[9] = { 0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF }; + + bool alu_busy[TOTAL_LATENCY + 
1][ALU_COUNT]; + bool is_rotation[V4_INSTRUCTION_COUNT]; + bool rotated[4]; + int rotate_count = 0; + + memset(latency, 0, sizeof(latency)); + memset(asic_latency, 0, sizeof(asic_latency)); + memset(alu_busy, 0, sizeof(alu_busy)); + memset(is_rotation, 0, sizeof(is_rotation)); + memset(rotated, 0, sizeof(rotated)); + is_rotation[ROR] = true; + is_rotation[ROL] = true; + + int num_retries = 0; + code_size = 0; + + int total_iterations = 0; + r8_used = (ALGO == cryptonight_r_wow); + + // Generate random code to achieve minimal required latency for our abstract CPU + // Try to get this latency for all 4 registers + while (((latency[0] < TOTAL_LATENCY) || (latency[1] < TOTAL_LATENCY) || (latency[2] < TOTAL_LATENCY) || (latency[3] < TOTAL_LATENCY)) && (num_retries < 64)) + { + // Fail-safe to guarantee loop termination + ++total_iterations; + if (total_iterations > 256) + break; + + check_data(&data_index, 1, data, sizeof(data)); + + const uint8_t c = ((uint8_t*)data)[data_index++]; + + // MUL = opcodes 0-2 + // ADD = opcode 3 + // SUB = opcode 4 + // ROR/ROL = opcode 5, shift direction is selected randomly + // XOR = opcodes 6-7 + uint8_t opcode = c & ((1 << V4_OPCODE_BITS) - 1); + if (opcode == 5) + { + check_data(&data_index, 1, data, sizeof(data)); + opcode = (data[data_index++] >= 0) ? ROR : ROL; + } + else if (opcode >= 6) + { + opcode = XOR; + } + else + { + opcode = (opcode <= 2) ? MUL : (opcode - 2); + } + + uint8_t dst_index = (c >> V4_OPCODE_BITS) & ((1 << V4_DST_INDEX_BITS) - 1); + uint8_t src_index = (c >> (V4_OPCODE_BITS + V4_DST_INDEX_BITS)) & ((1 << V4_SRC_INDEX_BITS) - 1); + + const int a = dst_index; + int b = src_index; + + // Don't do ADD/SUB/XOR with the same register + if (((opcode == ADD) || (opcode == SUB) || (opcode == XOR)) && (a == b)) + { + // a is always < 4, so we don't need to check bounds here + b = (ALGO == cryptonight_r_wow) ? 
(a + 4) : 8; + src_index = b; + } + + // Don't do rotation with the same destination twice because it's equal to a single rotation + if (is_rotation[opcode] && rotated[a]) + { + continue; + } + + // Don't do the same instruction (except MUL) with the same source value twice because all other cases can be optimized: + // 2xADD(a, b, C) = ADD(a, b*2, C1+C2), same for SUB and rotations + // 2xXOR(a, b) = NOP + if ((opcode != MUL) && ((inst_data[a] & 0xFFFF00) == (opcode << 8) + ((inst_data[b] & 255) << 16))) + { + continue; + } + + // Find which ALU is available (and when) for this instruction + int next_latency = (latency[a] > latency[b]) ? latency[a] : latency[b]; + int alu_index = -1; + while (next_latency < TOTAL_LATENCY) + { + for (int i = op_ALUs[opcode] - 1; i >= 0; --i) + { + if (!alu_busy[next_latency][i]) + { + // ADD is implemented as two 1-cycle instructions on a real CPU, so do an additional availability check + if ((opcode == ADD) && alu_busy[next_latency + 1][i]) + { + continue; + } + + // Rotation can only start when previous rotation is finished, so do an additional availability check + if (is_rotation[opcode] && (next_latency < rotate_count * op_latency[opcode])) + { + continue; + } + + alu_index = i; + break; + } + } + if (alu_index >= 0) + { + break; + } + ++next_latency; + } + + // Don't generate instructions that leave some register unchanged for more than 7 cycles + if (next_latency > latency[a] + 7) + { + continue; + } + + next_latency += op_latency[opcode]; + + if (next_latency <= TOTAL_LATENCY) + { + if (is_rotation[opcode]) + { + ++rotate_count; + } + + // Mark ALU as busy only for the first cycle when it starts executing the instruction because ALUs are fully pipelined + alu_busy[next_latency - op_latency[opcode]][alu_index] = true; + latency[a] = next_latency; + + // ASIC is supposed to have enough ALUs to run as many independent instructions per cycle as possible, so latency calculation for ASIC is simple + asic_latency[a] = 
((asic_latency[a] > asic_latency[b]) ? asic_latency[a] : asic_latency[b]) + asic_op_latency[opcode]; + + rotated[a] = is_rotation[opcode]; + + inst_data[a] = code_size + (opcode << 8) + ((inst_data[b] & 255) << 16); + + code[code_size].opcode = opcode; + code[code_size].dst_index = dst_index; + code[code_size].src_index = src_index; + code[code_size].C = 0; + + if (src_index == 8) + { + r8_used = true; + } + + if (opcode == ADD) + { + // ADD instruction is implemented as two 1-cycle instructions on a real CPU, so mark ALU as busy for the next cycle too + alu_busy[next_latency - op_latency[opcode] + 1][alu_index] = true; + + // ADD instruction requires 4 more random bytes for 32-bit constant "C" in "a = a + b + C" + check_data(&data_index, sizeof(uint32_t), data, sizeof(data)); + uint32_t t; + memcpy(&t, data + data_index, sizeof(uint32_t)); + code[code_size].C = SWAP32LE(t); + data_index += sizeof(uint32_t); + } + + ++code_size; + if (code_size >= NUM_INSTRUCTIONS_MIN) + { + break; + } + } + else + { + ++num_retries; + } + } + + // ASIC has more execution resources and can extract as much parallelism from the code as possible + // We need to add a few more MUL and ROR instructions to achieve minimal required latency for ASIC + // Get this latency for at least 1 of the 4 registers + const int prev_code_size = code_size; + while ((code_size < NUM_INSTRUCTIONS_MAX) && (asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY)) + { + int min_idx = 0; + int max_idx = 0; + for (int i = 1; i < 4; ++i) + { + if (asic_latency[i] < asic_latency[min_idx]) min_idx = i; + if (asic_latency[i] > asic_latency[max_idx]) max_idx = i; + } + + const uint8_t pattern[3] = { ROR, MUL, MUL }; + const uint8_t opcode = pattern[(code_size - prev_code_size) % 3]; + latency[min_idx] = latency[max_idx] + op_latency[opcode]; + asic_latency[min_idx] = asic_latency[max_idx] + asic_op_latency[opcode]; + + 
code[code_size].opcode = opcode; + code[code_size].dst_index = min_idx; + code[code_size].src_index = max_idx; + code[code_size].C = 0; + ++code_size; + } + + // There is ~98.15% chance that loop condition is false, so this loop will execute only 1 iteration most of the time + // It never does more than 4 iterations for all block heights < 10,000,000 + } while (!r8_used || (code_size < NUM_INSTRUCTIONS_MIN) || (code_size > NUM_INSTRUCTIONS_MAX)); + + // It's guaranteed that NUM_INSTRUCTIONS_MIN <= code_size <= NUM_INSTRUCTIONS_MAX here + // Add final instruction to stop the interpreter + code[code_size].opcode = RET; + code[code_size].dst_index = 0; + code[code_size].src_index = 0; + code[code_size].C = 0; + + return code_size; +} diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index 20203a3c5..064b07339 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -50,6 +50,7 @@ #include #include #include +#include #ifdef _WIN32 #include @@ -239,169 +240,208 @@ bool minethd::self_test() cn_hash_fun hashf; cn_hash_fun hashf_multi; - xmrstak_algo algo = xmrstak_algo::invalid_algo; + auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); - for(int algo_idx = 0; algo_idx < 2; ++algo_idx) + for(const auto algo : neededAlgorithms) { - if(algo_idx == 0) - algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo(); - else - algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot(); - - if(algo == cryptonight) + if(algo == POW(cryptonight)) { - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); - hashf("This is a test", 14, out, ctx); + std::cout<HaveHardwareAes(), false, algo); + hashf("This is a test", 14, out, ctx, algo); bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) 
== 0; - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight); - hashf("This is a test", 14, out, ctx); + minethd::cn_on_new_job dm; + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo); + hashf("This is a test", 14, out, ctx, algo); bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; - hashf_multi = func_multi_selector<2>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); - hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx); + func_multi_selector<2>(hashf_multi, dm, ::jconf::inst()->HaveHardwareAes(), false, algo); + hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx, algo); bResult = bResult && memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; - hashf_multi = func_multi_selector<2>(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight); - hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx); + func_multi_selector<2>(hashf_multi, dm, ::jconf::inst()->HaveHardwareAes(), true, algo); + hashf_multi("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx, algo); bResult = bResult && memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; - hashf_multi = 
func_multi_selector<3>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); - hashf_multi("This is a testThis is a testThis is a test", 14, out, ctx); + func_multi_selector<3>(hashf_multi, dm, ::jconf::inst()->HaveHardwareAes(), false, algo); + hashf_multi("This is a testThis is a testThis is a test", 14, out, ctx, algo); bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 96) == 0; - hashf_multi = func_multi_selector<4>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); - hashf_multi("This is a testThis is a testThis is a testThis is a test", 14, out, ctx); + func_multi_selector<4>(hashf_multi, dm, ::jconf::inst()->HaveHardwareAes(), false, algo); + hashf_multi("This is a testThis is a testThis is a testThis is a test", 14, out, ctx, algo); bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 128) == 0; - hashf_multi = func_multi_selector<5>(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight); - hashf_multi("This is a testThis is a testThis is a testThis is a testThis is a test", 14, out, ctx); + func_multi_selector<5>(hashf_multi, dm, 
::jconf::inst()->HaveHardwareAes(), false, algo); + hashf_multi("This is a testThis is a testThis is a testThis is a testThis is a test", 14, out, ctx, algo); bResult = bResult && memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05" "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 160) == 0; } - else if(algo == cryptonight_lite) + else if(algo == POW(cryptonight_lite)) { - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_lite); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0; - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_lite); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, "\x5a\x24\xa0\x29\xde\x1c\x39\x3f\x3d\x52\x7a\x2f\x9b\x39\xdc\x3d\xb3\xbc\x87\x11\x8b\x84\x52\x9b\x9f\x0\x88\x49\x25\x4b\x5\xce", 32) == 0; } - else if(algo == cryptonight_monero) + else if(algo == 
POW(cryptonight_monero)) { - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_monero); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0; - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_monero); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, "\x1\x57\xc5\xee\x18\x8b\xbe\xc8\x97\x52\x85\xa3\x6\x4e\xe9\x20\x65\x21\x76\x72\xfd\x69\xa1\xae\xbd\x7\x66\xc7\xb5\x6e\xe0\xbd", 32) == 0; } - else if(algo == cryptonight_monero_v8) + else if(algo == POW(cryptonight_monero_v8)) { - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_monero_v8); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = memcmp(out, "\x35\x3f\xdc\x06\x8f\xd4\x7b\x03\xc0\x4b\x94\x31\xe0\x05\xe0\x0b\x68\xc2\x16\x8a\x3c\xc7\x33\x5c\x8b\x9b\x30\x81\x56\x59\x1a\x4f", 32) == 0; - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_monero_v8); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult &= memcmp(out, 
"\x35\x3f\xdc\x06\x8f\xd4\x7b\x03\xc0\x4b\x94\x31\xe0\x05\xe0\x0b\x68\xc2\x16\x8a\x3c\xc7\x33\x5c\x8b\x9b\x30\x81\x56\x59\x1a\x4f", 32) == 0; } - else if(algo == cryptonight_aeon) + else if(algo == POW(cryptonight_aeon)) { - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_aeon); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0; - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_aeon); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, "\xfc\xa1\x7d\x44\x37\x70\x9b\x4a\x3b\xd7\x1e\xf3\xed\x21\xb4\x17\xca\x93\xdc\x86\x79\xce\x81\xdf\xd3\xcb\xdd\xa\x22\xd7\x58\xba", 32) == 0; } - else if(algo == cryptonight_ipbc) + else if(algo == POW(cryptonight_ipbc)) { - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_ipbc); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0xb0", 32) == 0; - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_ipbc); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = 
func_selector(::jconf::inst()->HaveHardwareAes(), true, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, "\xbc\xe7\x48\xaf\xc5\x31\xff\xc9\x33\x7f\xcf\x51\x1b\xe3\x20\xa3\xaa\x8d\x4\x55\xf9\x14\x2a\x61\xe8\x38\xdf\xdc\x3b\x28\x3e\x0", 32) == 0; } - else if(algo == cryptonight_stellite) + else if(algo == POW(cryptonight_stellite)) { - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_stellite); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0; - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_stellite); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, "\xb9\x9d\x6c\xee\x50\x3c\x6f\xa6\x3f\x30\x69\x24\x4a\x0\x9f\xe4\xd4\x69\x3f\x68\x92\xa4\x5c\xc2\x51\xae\x46\x87\x7c\x6b\x98\xae", 32) == 0; } - else if(algo == cryptonight_masari) + else if(algo == POW(cryptonight_masari)) { - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_masari); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0; - hashf = 
func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_masari); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, "\xbf\x5f\xd\xf3\x5a\x65\x7c\x89\xb0\x41\xcf\xf0\xd\x46\x6a\xb6\x30\xf9\x77\x7f\xd9\xc6\x3\xd7\x3b\xd8\xf1\xb5\x4b\x49\xed\x28", 32) == 0; } - else if(algo == cryptonight_heavy) + else if(algo == POW(cryptonight_heavy)) { - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_heavy); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0; - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_heavy); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, "\xf9\x44\x97\xce\xb4\xf0\xd9\x84\xb\x9b\xfc\x45\x94\x74\x55\x25\xcf\x26\x83\x16\x4f\xc\xf8\x2d\xf5\xf\x25\xff\x45\x28\x2e\x85", 32) == 0; } - else if(algo == cryptonight_haven) + else if(algo == POW(cryptonight_haven)) { - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_haven); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, 
"\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0; - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, xmrstak_algo::cryptonight_haven); - hashf("This is a test This is a test This is a test", 44, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, "\xc7\xd4\x52\x9\x2b\x48\xa5\xaf\xae\x11\xaf\x40\x9a\x87\xe5\x88\xf0\x29\x35\xa3\x68\xd\xe3\x6b\xce\x43\xf6\xc8\xdf\xd3\xe3\x9", 32) == 0; } - else if(algo == cryptonight_bittube2) + else if(algo == POW(cryptonight_bittube2)) { unsigned char out[32 * MAX_N]; cn_hash_fun hashf; - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_bittube2); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo); - hashf("\x38\x27\x4c\x97\xc4\x5a\x17\x2c\xfc\x97\x67\x98\x70\x42\x2e\x3a\x1a\xb0\x78\x49\x60\xc6\x05\x14\xd8\x16\x27\x14\x15\xc3\x06\xee\x3a\x3e\xd1\xa7\x7e\x31\xf6\xa8\x85\xc3\xcb\xff\x01\x02\x03\x04", 48, out, ctx); + hashf("\x38\x27\x4c\x97\xc4\x5a\x17\x2c\xfc\x97\x67\x98\x70\x42\x2e\x3a\x1a\xb0\x78\x49\x60\xc6\x05\x14\xd8\x16\x27\x14\x15\xc3\x06\xee\x3a\x3e\xd1\xa7\x7e\x31\xf6\xa8\x85\xc3\xcb\xff\x01\x02\x03\x04", 48, out, ctx, algo); bResult = bResult && memcmp(out, "\x18\x2c\x30\x41\x93\x1a\x14\x73\xc6\xbf\x7e\x77\xfe\xb5\x17\x9b\xa8\xbe\xa9\x68\xba\x9e\xe1\xe8\x24\x1a\x12\x7a\xac\x81\xb4\x24", 32) == 0; - hashf("\x04\x04\xb4\x94\xce\xd9\x05\x18\xe7\x25\x5d\x01\x28\x63\xde\x8a\x4d\x27\x72\xb1\xff\x78\x8c\xd0\x56\x20\x38\x98\x3e\xd6\x8c\x94\xea\x00\xfe\x43\x66\x68\x83\x00\x00\x00\x00\x18\x7c\x2e\x0f\x66\xf5\x6b\xb9\xef\x67\xed\x35\x14\x5c\x69\xd4\x69\x0d\x1f\x98\x22\x44\x01\x2b\xea\x69\x6e\xe8\xb3\x3c\x42\x12\x01", 76, out, ctx); + 
hashf("\x04\x04\xb4\x94\xce\xd9\x05\x18\xe7\x25\x5d\x01\x28\x63\xde\x8a\x4d\x27\x72\xb1\xff\x78\x8c\xd0\x56\x20\x38\x98\x3e\xd6\x8c\x94\xea\x00\xfe\x43\x66\x68\x83\x00\x00\x00\x00\x18\x7c\x2e\x0f\x66\xf5\x6b\xb9\xef\x67\xed\x35\x14\x5c\x69\xd4\x69\x0d\x1f\x98\x22\x44\x01\x2b\xea\x69\x6e\xe8\xb3\x3c\x42\x12\x01", 76, out, ctx, algo); bResult = bResult && memcmp(out, "\x7f\xbe\xb9\x92\x76\x87\x5a\x3c\x43\xc2\xbe\x5a\x73\x36\x06\xb5\xdc\x79\xcc\x9c\xf3\x7c\x43\x3e\xb4\x18\x56\x17\xfb\x9b\xc9\x36", 32) == 0; - hashf("\x85\x19\xe0\x39\x17\x2b\x0d\x70\xe5\xca\x7b\x33\x83\xd6\xb3\x16\x73\x15\xa4\x22\x74\x7b\x73\xf0\x19\xcf\x95\x28\xf0\xfd\xe3\x41\xfd\x0f\x2a\x63\x03\x0b\xa6\x45\x05\x25\xcf\x6d\xe3\x18\x37\x66\x9a\xf6\xf1\xdf\x81\x31\xfa\xf5\x0a\xaa\xb8\xd3\xa7\x40\x55\x89", 64, out, ctx); + hashf("\x85\x19\xe0\x39\x17\x2b\x0d\x70\xe5\xca\x7b\x33\x83\xd6\xb3\x16\x73\x15\xa4\x22\x74\x7b\x73\xf0\x19\xcf\x95\x28\xf0\xfd\xe3\x41\xfd\x0f\x2a\x63\x03\x0b\xa6\x45\x05\x25\xcf\x6d\xe3\x18\x37\x66\x9a\xf6\xf1\xdf\x81\x31\xfa\xf5\x0a\xaa\xb8\xd3\xa7\x40\x55\x89", 64, out, ctx, algo); bResult = bResult && memcmp(out, "\x90\xdc\x65\x53\x8d\xb0\x00\xea\xa2\x52\xcd\xd4\x1c\x17\x7a\x64\xfe\xff\x95\x36\xe7\x71\x68\x35\xd4\xcf\x5c\x73\x56\xb1\x2f\xcd", 32) == 0; } - else if(algo == cryptonight_superfast) + else if(algo == POW(cryptonight_superfast)) { - hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_superfast); - hashf("\x03\x05\xa0\xdb\xd6\xbf\x05\xcf\x16\xe5\x03\xf3\xa6\x6f\x78\x00\x7c\xbf\x34\x14\x43\x32\xec\xbf\xc2\x2e\xd9\x5c\x87\x00\x38\x3b\x30\x9a\xce\x19\x23\xa0\x96\x4b\x00\x00\x00\x08\xba\x93\x9a\x62\x72\x4c\x0d\x75\x81\xfc\xe5\x76\x1e\x9d\x8a\x0e\x6a\x1c\x3f\x92\x4f\xdd\x84\x93\xd1\x11\x56\x49\xc0\x5e\xb6\x01", 76, out, ctx); + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo); + 
hashf("\x03\x05\xa0\xdb\xd6\xbf\x05\xcf\x16\xe5\x03\xf3\xa6\x6f\x78\x00\x7c\xbf\x34\x14\x43\x32\xec\xbf\xc2\x2e\xd9\x5c\x87\x00\x38\x3b\x30\x9a\xce\x19\x23\xa0\x96\x4b\x00\x00\x00\x08\xba\x93\x9a\x62\x72\x4c\x0d\x75\x81\xfc\xe5\x76\x1e\x9d\x8a\x0e\x6a\x1c\x3f\x92\x4f\xdd\x84\x93\xd1\x11\x56\x49\xc0\x5e\xb6\x01", 76, out, ctx, algo); bResult = bResult && memcmp(out, "\x40\x86\x5a\xa8\x87\x41\xec\x1d\xcc\xbd\x2b\xc6\xff\x36\xb9\x4d\x54\x71\x58\xdb\x94\x69\x8e\x3c\xa0\x3d\xe4\x81\x9a\x65\x9f\xef", 32) == 0; } + else if(algo == POW(cryptonight_gpu)) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo); + hashf("", 0, out, ctx, algo); + bResult = bResult && memcmp(out, "\x55\x5e\x0a\xee\x78\x79\x31\x6d\x7d\xef\xf7\x72\x97\x3c\xb9\x11\x8e\x38\x95\x70\x9d\xb2\x54\x7a\xc0\x72\xd5\xb9\x13\x10\x01\xd8", 32) == 0; + + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo); + hashf("", 0, out, ctx, algo); + bResult = bResult && memcmp(out, "\x55\x5e\x0a\xee\x78\x79\x31\x6d\x7d\xef\xf7\x72\x97\x3c\xb9\x11\x8e\x38\x95\x70\x9d\xb2\x54\x7a\xc0\x72\xd5\xb9\x13\x10\x01\xd8", 32) == 0; + } + else if(algo == POW(cryptonight_conceal)) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo); + hashf("", 0, out, ctx, algo); + bResult = bResult && memcmp(out, "\xb5\x54\x4b\x58\x16\x70\x26\x47\x63\x47\xe4\x1f\xb6\x5e\x57\xc9\x7c\xa5\x93\xfe\x0e\xb1\x0f\xb9\x2f\xa7\x3e\x5b\xae\xef\x79\x8c", 32) == 0; + + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo); + hashf("", 0, out, ctx, algo); + bResult = bResult && memcmp(out, "\xb5\x54\x4b\x58\x16\x70\x26\x47\x63\x47\xe4\x1f\xb6\x5e\x57\xc9\x7c\xa5\x93\xfe\x0e\xb1\x0f\xb9\x2f\xa7\x3e\x5b\xae\xef\x79\x8c", 32) == 0; + } + else if (algo == POW(cryptonight_turtle)) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); + bResult = bResult && memcmp(out, 
"\x30\x5f\x66\xfe\xbb\xf3\x60\x0e\xda\xbb\x60\xf7\xf1\xc9\xb9\x0a\x3a\xe8\x5a\x31\xd4\x76\xca\x38\x1d\x56\x18\xa6\xc6\x27\x60\xd7", 32) == 0; + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); + bResult = bResult && memcmp(out, "\x30\x5f\x66\xfe\xbb\xf3\x60\x0e\xda\xbb\x60\xf7\xf1\xc9\xb9\x0a\x3a\xe8\x5a\x31\xd4\x76\xca\x38\x1d\x56\x18\xa6\xc6\x27\x60\xd7", 32) == 0; + } + else if(algo == POW(cryptonight_r)) + { + minethd::cn_on_new_job set_job; + func_multi_selector<1>(hashf, set_job, ::jconf::inst()->HaveHardwareAes(), false, algo); + miner_work work; + work.iBlockHeight = 1806260; + set_job(work, ctx); + hashf("\x54\x68\x69\x73\x20\x69\x73\x20\x61\x20\x74\x65\x73\x74\x20\x54\x68\x69\x73\x20\x69\x73\x20\x61\x20\x74\x65\x73\x74\x20\x54\x68\x69\x73\x20\x69\x73\x20\x61\x20\x74\x65\x73\x74", 44, out, ctx, algo); + bResult = bResult && memcmp(out, "\xf7\x59\x58\x8a\xd5\x7e\x75\x84\x67\x29\x54\x43\xa9\xbd\x71\x49\x0a\xbf\xf8\xe9\xda\xd1\xb9\x5b\x6b\xf2\xf5\xd0\xd7\x83\x87\xbc", 32) == 0; + } + else + printer::inst()->print_msg(L0, + "Cryptonight hash self-test NOT defined for POW %s", algo.Name().c_str()); if(!bResult) printer::inst()->print_msg(L0, @@ -483,7 +523,8 @@ static std::string getAsmName(const uint32_t num_hashes) } template -minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str) +void minethd::func_multi_selector(minethd::cn_hash_fun& hash_fun, minethd::cn_on_new_job& on_new_job, + bool bHaveAes, bool bNoPrefetch, const xmrstak_algo& algo, const std::string& asm_version_str) { static_assert(N >= 1, "number of threads must be >= 1" ); @@ -492,7 +533,7 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc // function as a two digit binary uint8_t algv; - switch(algo) + switch(algo.Id()) { case cryptonight: algv = 2; @@ -530,6 +571,15 @@ 
minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc case cryptonight_superfast: algv = 11; break; + case cryptonight_gpu: + algv = 12; + break; + case cryptonight_conceal: + algv = 13; + break; + case cryptonight_r: + algv = 14; + break; default: algv = 2; break; @@ -590,22 +640,36 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc Cryptonight_hash::template hash, Cryptonight_hash::template hash, Cryptonight_hash::template hash, - + Cryptonight_hash::template hash, Cryptonight_hash::template hash, Cryptonight_hash::template hash, - Cryptonight_hash::template hash + Cryptonight_hash::template hash, + + Cryptonight_hash_gpu::template hash, + Cryptonight_hash_gpu::template hash, + Cryptonight_hash_gpu::template hash, + Cryptonight_hash_gpu::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash }; std::bitset<2> digit; digit.set(0, !bHaveAes); digit.set(1, !bNoPrefetch); - auto selected_function = func_table[ algv << 2 | digit.to_ulong() ]; - + hash_fun = func_table[ algv << 2 | digit.to_ulong() ]; // check for asm optimized version for cryptonight_v8 - if(N <= 2 && algo == cryptonight_monero_v8 && bHaveAes) + if(N <= 2 && algo == cryptonight_monero_v8 && bHaveAes && algo.Mem() == CN_MEMORY && algo.Iter() == CN_ITER) { std::string selected_asm = asm_version_str; if(selected_asm == "auto") @@ -617,15 +681,15 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc { // Intel Ivy Bridge (Xeon v2, Core i7/i5/i3 3xxx, Pentium G2xxx, Celeron G1xxx) if(N == 1) - selected_function = Cryptonight_hash_asm<1u, 0u>::template hash; + hash_fun = Cryptonight_hash_asm<1u, 0u>::template hash; else if(N == 2) - selected_function = Cryptonight_hash_asm<2u, 
0u>::template hash; + hash_fun = Cryptonight_hash_asm<2u, 0u>::template hash; } // supports only 1 thread per hash if(N == 1 && selected_asm == "amd_avx") { // AMD Ryzen (1xxx and 2xxx series) - selected_function = Cryptonight_hash_asm<1u, 1u>::template hash; + hash_fun = Cryptonight_hash_asm<1u, 1u>::template hash; } if(asm_version_str == "auto" && (selected_asm != "intel_avx" || selected_asm != "amd_avx")) printer::inst()->print_msg(L3, "Switch to assembler version for '%s' cpu's", selected_asm.c_str()); @@ -633,13 +697,24 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc printer::inst()->print_msg(L1, "Assembler '%s' unknown, fallback to non asm version of cryptonight_v8", selected_asm.c_str()); } } - - return selected_function; + + static const std::unordered_map on_new_job_map = { + {cryptonight_r, Cryptonight_R_generator::template cn_on_new_job}, + }; + + auto it = on_new_job_map.find(algo.Id()); + if (it != on_new_job_map.end()) + on_new_job = it->second; + else + on_new_job = nullptr; } -minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo) +minethd::cn_hash_fun minethd::func_selector(bool bHaveAes, bool bNoPrefetch, const xmrstak_algo& algo) { - return func_multi_selector<1>(bHaveAes, bNoPrefetch, algo); + minethd::cn_hash_fun fun; + minethd::cn_on_new_job dm; + func_multi_selector<1>(fun, dm, bHaveAes, bNoPrefetch, algo); + return fun; } void minethd::work_main() @@ -719,10 +794,12 @@ void minethd::multiway_work_main() // start with root algorithm and switch later if fork version is reached auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot(); - cn_hash_fun hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str); + cn_hash_fun hash_fun_multi; + cn_on_new_job on_new_job; uint8_t version = 0; size_t lastPoolId = 0; + func_multi_selector(hash_fun_multi, on_new_job, 
::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str); while (bQuit == 0) { if (oWork.bStall) @@ -754,17 +831,20 @@ void minethd::multiway_work_main() if(new_version >= coinDesc.GetMiningForkVersion()) { miner_algo = coinDesc.GetMiningAlgo(); - hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str); + func_multi_selector(hash_fun_multi, on_new_job, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str); } else { miner_algo = coinDesc.GetMiningAlgoRoot(); - hash_fun_multi = func_multi_selector(::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str); + func_multi_selector(hash_fun_multi, on_new_job, ::jconf::inst()->HaveHardwareAes(), bNoPrefetch, miner_algo, asm_version_str); } lastPoolId = oWork.iPoolId; version = new_version; } + if(on_new_job != nullptr) + on_new_job(oWork, ctx); + while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) { if ((iCount++ & 0x7) == 0) //Store stats every 8*N hashes @@ -787,7 +867,7 @@ void minethd::multiway_work_main() for (size_t i = 0; i < N; i++) *piNonce[i] = iNonce++; - hash_fun_multi(bWorkBlob, oWork.iWorkSize, bHashOut, ctx); + hash_fun_multi(bWorkBlob, oWork.iWorkSize, bHashOut, ctx, miner_algo); for (size_t i = 0; i < N; i++) { diff --git a/xmrstak/backend/cpu/minethd.hpp b/xmrstak/backend/cpu/minethd.hpp index eb77749f6..ca89e5b52 100644 --- a/xmrstak/backend/cpu/minethd.hpp +++ b/xmrstak/backend/cpu/minethd.hpp @@ -22,18 +22,20 @@ class minethd : public iBackend static std::vector thread_starter(uint32_t threadOffset, miner_work& pWork); static bool self_test(); - typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**); + typedef void (*cn_on_new_job)(const miner_work&, cryptonight_ctx**); + typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**, const xmrstak_algo&); - static cn_hash_fun func_selector(bool bHaveAes, bool 
bNoPrefetch, xmrstak_algo algo); + static cn_hash_fun func_selector(bool bHaveAes, bool bNoPrefetch, const xmrstak_algo& algo); static bool thd_setaffinity(std::thread::native_handle_type h, uint64_t cpu_id); static cryptonight_ctx* minethd_alloc_ctx(); -private: - template - static cn_hash_fun func_multi_selector(bool bHaveAes, bool bNoPrefetch, xmrstak_algo algo, const std::string& asm_version_str = "off"); + static void func_multi_selector(minethd::cn_hash_fun& hash_fun, minethd::cn_on_new_job& on_new_job, + bool bHaveAes, bool bNoPrefetch, const xmrstak_algo& algo, const std::string& asm_version_str = "off"); + private: + minethd(miner_work& pWork, size_t iNo, int iMultiway, bool no_prefetch, int64_t affinity, const std::string& asm_version); template diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp index e905caa9f..00311bb93 100644 --- a/xmrstak/backend/cryptonight.hpp +++ b/xmrstak/backend/cryptonight.hpp @@ -2,8 +2,12 @@ #include #include #include +#include +#include -enum xmrstak_algo +constexpr size_t start_derived_algo_id = 1000; + +enum xmrstak_algo_id { invalid_algo = 0, cryptonight = 1, @@ -17,213 +21,193 @@ enum xmrstak_algo cryptonight_haven = 9, // equal to cryptonight_heavy with a small tweak cryptonight_bittube2 = 10, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks cryptonight_monero_v8 = 11, - cryptonight_superfast = 12 + cryptonight_superfast = 12, + cryptonight_gpu = 13, + cryptonight_conceal = 14, + cryptonight_r_wow = 15, + cryptonight_r = 16, + + cryptonight_turtle = start_derived_algo_id, + cryptonight_v8_half = (start_derived_algo_id + 1), + cryptonight_v8_zelerius = (start_derived_algo_id + 2) + // please add the algorithm name to get_algo_name() }; -// define aeon settings -constexpr size_t CRYPTONIGHT_LITE_MEMORY = 1 * 1024 * 1024; -constexpr uint32_t CRYPTONIGHT_LITE_MASK = 0xFFFF0; -constexpr uint32_t CRYPTONIGHT_LITE_ITER = 0x40000; - -constexpr size_t 
CRYPTONIGHT_MEMORY = 2 * 1024 * 1024; -constexpr uint32_t CRYPTONIGHT_MASK = 0x1FFFF0; -constexpr uint32_t CRYPTONIGHT_ITER = 0x80000; - -constexpr size_t CRYPTONIGHT_HEAVY_MEMORY = 4 * 1024 * 1024; -constexpr uint32_t CRYPTONIGHT_HEAVY_MASK = 0x3FFFF0; -constexpr uint32_t CRYPTONIGHT_HEAVY_ITER = 0x40000; - -constexpr uint32_t CRYPTONIGHT_MASARI_ITER = 0x40000; - -constexpr uint32_t CRYPTONIGHT_SUPERFAST_ITER = 0x20000; - -template -inline constexpr size_t cn_select_memory() { return 0; } - -template<> -inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_MEMORY; } - -template<> -inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_LITE_MEMORY; } - -template<> -inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_MEMORY; } - -template<> -inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_MEMORY; } - -template<> -inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_HEAVY_MEMORY; } - -template<> -inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_LITE_MEMORY; } - -template<> -inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_LITE_MEMORY; } - -template<> -inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_MEMORY; } - -template<> -inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_MEMORY; } - -template<> -inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_HEAVY_MEMORY; } - -template<> -inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_HEAVY_MEMORY; } - -template<> -inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_MEMORY; } +/** get name of the algorithm + * + * @param algo mining algorithm + */ +inline std::string get_algo_name(xmrstak_algo_id algo_id) +{ + static std::array base_algo_names = + {{ + "invalid_algo", + "cryptonight", + "cryptonight_lite", + "cryptonight_v7", + "cryptonight_heavy", + "cryptonight_lite_v7", + "cryptonight_lite_v7_xor", + "cryptonight_v7_stellite", + "cryptonight_masari", + 
"cryptonight_haven", + "cryptonight_bittube2", + "cryptonight_v8", + "cryptonight_superfast", + "cryptonight_gpu", + "cryptonight_conceal", + "cryptonight_r_wow", + "cryptonight_r" + }}; + + static std::array derived_algo_names = + {{ + "cryptonight_turtle", + "cryptonight_v8_half", // used by masari and stellite + "cryptonight_v8_zelerius" + }}; + + + if(algo_id < start_derived_algo_id) + return base_algo_names[algo_id]; + else + return derived_algo_names[algo_id - start_derived_algo_id]; +} -inline size_t cn_select_memory(xmrstak_algo algo) +struct xmrstak_algo { - switch(algo) + xmrstak_algo(xmrstak_algo_id name_id) : algo_name(name_id), base_algo(name_id) + { + } + xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm) : algo_name(name_id), base_algo(algorithm) + { + } + xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration) : algo_name(name_id), base_algo(algorithm), iter(iteration) + { + } + xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration, size_t memory) : algo_name(name_id), base_algo(algorithm), iter(iteration), mem(memory) + { + } + xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration, size_t memory, uint32_t mem_mask) : algo_name(name_id), base_algo(algorithm), iter(iteration), mem(memory), mask(mem_mask) { - case cryptonight_stellite: - case cryptonight_monero: - case cryptonight_monero_v8: - case cryptonight_masari: - case cryptonight: - case cryptonight_superfast: - return CRYPTONIGHT_MEMORY; - case cryptonight_ipbc: - case cryptonight_aeon: - case cryptonight_lite: - return CRYPTONIGHT_LITE_MEMORY; - case cryptonight_bittube2: - case cryptonight_haven: - case cryptonight_heavy: - return CRYPTONIGHT_HEAVY_MEMORY; - default: - return 0; } -} - -template -inline constexpr uint32_t cn_select_mask() { return 0; } - -template<> -inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_MASK; } - -template<> -inline constexpr uint32_t 
cn_select_mask() { return CRYPTONIGHT_LITE_MASK; } - -template<> -inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_MASK; } - -template<> -inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_MASK; } - -template<> -inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_HEAVY_MASK; } - -template<> -inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_LITE_MASK; } - -template<> -inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_LITE_MASK; } - -template<> -inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_MASK; } - -template<> -inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_MASK; } - -template<> -inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_HEAVY_MASK; } - -template<> -inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_HEAVY_MASK; } - -template<> -inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_MASK; } -inline size_t cn_select_mask(xmrstak_algo algo) -{ - switch(algo) + /** check if the algorithm is equal to another algorithm + * + * we do not check the member algo_name because this is only an alias name + */ + bool operator==(const xmrstak_algo& other) const { - case cryptonight_stellite: - case cryptonight_monero: - case cryptonight_monero_v8: - case cryptonight_masari: - case cryptonight: - case cryptonight_superfast: - return CRYPTONIGHT_MASK; - case cryptonight_ipbc: - case cryptonight_aeon: - case cryptonight_lite: - return CRYPTONIGHT_LITE_MASK; - case cryptonight_bittube2: - case cryptonight_haven: - case cryptonight_heavy: - return CRYPTONIGHT_HEAVY_MASK; - default: - return 0; + return other.Id() == Id() && other.Mem() == Mem() && other.Iter() == Iter() && other.Mask() == Mask(); } -} -template -inline constexpr uint32_t cn_select_iter() { return 0; } + bool operator==(const xmrstak_algo_id& id) const + { + return base_algo == id; + } -template<> -inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; } + 
operator xmrstak_algo_id() const + { + return base_algo; + } -template<> -inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_LITE_ITER; } + xmrstak_algo_id Id() const + { + return base_algo; + } -template<> -inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; } + size_t Mem() const + { + if(base_algo == invalid_algo) + return 0; + else + return mem; + } -template<> -inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; } + uint32_t Iter() const + { + return iter; + } -template<> -inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_HEAVY_ITER; } + /** Name of the algorithm + * + * This name is only an alias for the native implemented base algorithm. + */ + std::string Name() const + { + return get_algo_name(algo_name); + } -template<> -inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_LITE_ITER; } + /** Name of the parent algorithm + * + * This is the real algorithm which is implemented in all POW functions. + */ + std::string BaseName() const + { + return get_algo_name(base_algo); + } -template<> -inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_LITE_ITER; } + uint32_t Mask() const + { + // default is a 16 byte aligne mask + if(mask == 0) + return ((mem - 1u) / 16) * 16; + else + return mask; + } -template<> -inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; } + xmrstak_algo_id algo_name = invalid_algo; + xmrstak_algo_id base_algo = invalid_algo; + uint32_t iter = 0u; + size_t mem = 0u; + uint32_t mask = 0u; +}; -template<> -inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_MASARI_ITER; } +// default cryptonight +constexpr size_t CN_MEMORY = 2 * 1024 * 1024; +constexpr uint32_t CN_ITER = 0x80000; -template<> -inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_HEAVY_ITER; } +// crptonight gpu +constexpr uint32_t CN_GPU_MASK = 0x1FFFC0; +constexpr uint32_t CN_GPU_ITER = 0xC000; -template<> -inline constexpr uint32_t cn_select_iter() { 
return CRYPTONIGHT_HEAVY_ITER; } +// cryptonight turtle (the mask is not using the full 256kib scratchpad) +constexpr uint32_t CN_TURTLE_MASK = 0x1FFF0; -template<> -inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_SUPERFAST_ITER; } +constexpr uint32_t CN_ZELERIUS_ITER = 0x60000; -inline size_t cn_select_iter(xmrstak_algo algo) +inline xmrstak_algo POW(xmrstak_algo_id algo_id) { - switch(algo) - { - case cryptonight_stellite: - case cryptonight_monero: - case cryptonight_monero_v8: - case cryptonight: - return CRYPTONIGHT_ITER; - case cryptonight_ipbc: - case cryptonight_aeon: - case cryptonight_lite: - return CRYPTONIGHT_LITE_ITER; - case cryptonight_bittube2: - case cryptonight_haven: - case cryptonight_heavy: - return CRYPTONIGHT_HEAVY_ITER; - case cryptonight_masari: - return CRYPTONIGHT_MASARI_ITER; - case cryptonight_superfast: - return CRYPTONIGHT_SUPERFAST_ITER; - default: - return 0; - } + static std::array pow = {{ + {invalid_algo, invalid_algo}, + {cryptonight, cryptonight, CN_ITER, CN_MEMORY}, + {cryptonight_lite, cryptonight_lite, CN_ITER/2, CN_MEMORY/2}, + {cryptonight_monero, cryptonight_monero, CN_ITER, CN_MEMORY}, + {cryptonight_heavy, cryptonight_heavy, CN_ITER/2, CN_MEMORY*2}, + {cryptonight_aeon, cryptonight_aeon, CN_ITER/2, CN_MEMORY/2}, + {cryptonight_ipbc, cryptonight_ipbc, CN_ITER/2, CN_MEMORY/2}, // equal to cryptonight_aeon with a small tweak in the miner code + {cryptonight_stellite, cryptonight_stellite, CN_ITER, CN_MEMORY}, //equal to cryptonight_monero but with one tiny change + {cryptonight_masari, cryptonight_masari, CN_ITER/2, CN_MEMORY}, //equal to cryptonight_monero but with less iterations, used by masari + {cryptonight_haven, cryptonight_haven, CN_ITER/2, CN_MEMORY*2}, // equal to cryptonight_heavy with a small tweak + {cryptonight_bittube2, cryptonight_bittube2, CN_ITER/2, CN_MEMORY*2}, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks + {cryptonight_monero_v8, 
cryptonight_monero_v8, CN_ITER, CN_MEMORY}, + {cryptonight_superfast, cryptonight_superfast, CN_ITER/4, CN_MEMORY}, + {cryptonight_gpu, cryptonight_gpu, CN_GPU_ITER, CN_MEMORY, CN_GPU_MASK}, + {cryptonight_conceal, cryptonight_conceal, CN_ITER/2, CN_MEMORY}, + {cryptonight_r_wow, cryptonight_r_wow, CN_ITER, CN_MEMORY}, + {cryptonight_r, cryptonight_r, CN_ITER, CN_MEMORY} + }}; + + static std::array derived_pow = + {{ + {cryptonight_turtle, cryptonight_monero_v8, CN_ITER/8, CN_MEMORY/8, CN_TURTLE_MASK}, + {cryptonight_v8_half, cryptonight_monero_v8, CN_ITER/2, CN_MEMORY}, + {cryptonight_v8_zelerius, cryptonight_monero_v8, CN_ZELERIUS_ITER, CN_MEMORY} + // {cryptonight_derived} + }}; + + if(algo_id < start_derived_algo_id) + return pow[algo_id]; + else + return derived_pow[algo_id - start_derived_algo_id]; } diff --git a/xmrstak/backend/globalStates.cpp b/xmrstak/backend/globalStates.cpp index 4eeed3c4b..52ef3f391 100644 --- a/xmrstak/backend/globalStates.cpp +++ b/xmrstak/backend/globalStates.cpp @@ -33,7 +33,7 @@ namespace xmrstak { -void globalStates::consume_work( miner_work& threadWork, uint64_t& currentJobId) +void globalStates::consume_work(miner_work& threadWork, uint64_t& currentJobId) { jobLock.ReadLock(); @@ -43,7 +43,7 @@ void globalStates::consume_work( miner_work& threadWork, uint64_t& currentJobId) jobLock.UnLock(); } -void globalStates::switch_work(miner_work& pWork, pool_data& dat) +void globalStates::switch_work(miner_work&& pWork, pool_data& dat) { jobLock.WriteLock(); @@ -61,7 +61,7 @@ void globalStates::switch_work(miner_work& pWork, pool_data& dat) * after the nonce is read. 
*/ dat.iSavedNonce = iGlobalNonce.exchange(dat.iSavedNonce, std::memory_order_relaxed); - oGlobalWork = pWork; + oGlobalWork = std::move(pWork); jobLock.UnLock(); } diff --git a/xmrstak/backend/globalStates.hpp b/xmrstak/backend/globalStates.hpp index c8d691712..d6966c4a2 100644 --- a/xmrstak/backend/globalStates.hpp +++ b/xmrstak/backend/globalStates.hpp @@ -22,7 +22,7 @@ struct globalStates } //pool_data is in-out winapi style - void switch_work(miner_work& pWork, pool_data& dat); + void switch_work(miner_work&& pWork, pool_data& dat); inline void calc_start_nonce(uint32_t& nonce, bool use_nicehash, uint32_t reserve_count) { diff --git a/xmrstak/backend/miner_work.hpp b/xmrstak/backend/miner_work.hpp index b6456f031..c8174df32 100644 --- a/xmrstak/backend/miner_work.hpp +++ b/xmrstak/backend/miner_work.hpp @@ -21,29 +21,40 @@ namespace xmrstak bool bNiceHash; bool bStall; size_t iPoolId; + uint64_t iBlockHeight; + uint8_t* ref_ptr; - miner_work() : iWorkSize(0), bNiceHash(false), bStall(true), iPoolId(invalid_pool_id) { } + miner_work() : iWorkSize(0), bNiceHash(false), bStall(true), iPoolId(invalid_pool_id), ref_ptr((uint8_t*)&iBlockHeight) { } miner_work(const char* sJobID, const uint8_t* bWork, uint32_t iWorkSize, - uint64_t iTarget, bool bNiceHash, size_t iPoolId) : iWorkSize(iWorkSize), - iTarget(iTarget), bNiceHash(bNiceHash), bStall(false), iPoolId(iPoolId) + uint64_t iTarget, bool bNiceHash, size_t iPoolId, uint64_t iBlockHeiht) : iWorkSize(iWorkSize), + iTarget(iTarget), bNiceHash(bNiceHash), bStall(false), iPoolId(iPoolId), iBlockHeight(iBlockHeiht), ref_ptr((uint8_t*)&iBlockHeight) { assert(iWorkSize <= sizeof(bWorkBlob)); - memcpy(this->sJobID, sJobID, sizeof(miner_work::sJobID)); memcpy(this->bWorkBlob, bWork, iWorkSize); + memcpy(this->sJobID, sJobID, sizeof(miner_work::sJobID)); + } + + miner_work(miner_work&& from) : iWorkSize(from.iWorkSize), iTarget(from.iTarget), + bStall(from.bStall), iPoolId(from.iPoolId), iBlockHeight(from.iBlockHeight), 
ref_ptr((uint8_t*)&iBlockHeight) + { + assert(iWorkSize <= sizeof(bWorkBlob)); + memcpy(bWorkBlob, from.bWorkBlob, iWorkSize); + memcpy(this->sJobID, from.sJobID, sizeof(miner_work::sJobID)); + } miner_work(miner_work const&) = delete; - miner_work& operator=(miner_work const& from) + miner_work& operator=(miner_work&& from) { assert(this != &from); + iBlockHeight = from.iBlockHeight; + iPoolId = from.iPoolId; + bStall = from.bStall; iWorkSize = from.iWorkSize; - iTarget = from.iTarget; bNiceHash = from.bNiceHash; - bStall = from.bStall; - iPoolId = from.iPoolId; + iTarget = from.iTarget; assert(iWorkSize <= sizeof(bWorkBlob)); memcpy(sJobID, from.sJobID, sizeof(sJobID)); @@ -52,23 +63,22 @@ namespace xmrstak return *this; } - miner_work(miner_work&& from) : iWorkSize(from.iWorkSize), iTarget(from.iTarget), - bStall(from.bStall), iPoolId(from.iPoolId) - { - assert(iWorkSize <= sizeof(bWorkBlob)); - memcpy(sJobID, from.sJobID, sizeof(sJobID)); - memcpy(bWorkBlob, from.bWorkBlob, iWorkSize); - } - - miner_work& operator=(miner_work&& from) + miner_work& operator=(miner_work const& from) { assert(this != &from); + iBlockHeight = from.iBlockHeight; + iPoolId = from.iPoolId; + bStall = from.bStall; iWorkSize = from.iWorkSize; - iTarget = from.iTarget; bNiceHash = from.bNiceHash; - bStall = from.bStall; - iPoolId = from.iPoolId; + iTarget = from.iTarget; + + if(!ref_ptr) + return *this; + + for(size_t i=0; i <= 7 && iPoolId; i++) + ref_ptr[i] = from.ref_ptr[7-i]; assert(iWorkSize <= sizeof(bWorkBlob)); memcpy(sJobID, from.sJobID, sizeof(sJobID)); diff --git a/xmrstak/backend/nvidia/CudaCryptonightR_gen.cpp b/xmrstak/backend/nvidia/CudaCryptonightR_gen.cpp new file mode 100644 index 000000000..87eb05540 --- /dev/null +++ b/xmrstak/backend/nvidia/CudaCryptonightR_gen.cpp @@ -0,0 +1,336 @@ +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either 
version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + */ + +#include +#include +#include +#include +#include +#include + +#include "xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp" +#include "xmrstak/backend/cpu/crypto/variant4_random_math.h" +#include "xmrstak/misc/console.hpp" +#include "xmrstak/cpputil/read_write_lock.h" + +namespace xmrstak +{ +namespace nvidia +{ + +static std::string get_code(const V4_Instruction* code, int code_size) +{ + std::stringstream s; + + for (int i = 0; i < code_size; ++i) + { + const V4_Instruction inst = code[i]; + + const uint32_t a = inst.dst_index; + const uint32_t b = inst.src_index; + + switch (inst.opcode) + { + case MUL: + s << 'r' << a << "*=r" << b << ';'; + break; + + case ADD: + s << 'r' << a << "+=r" << b << '+' << inst.C << "U;"; + break; + + case SUB: + s << 'r' << a << "-=r" << b << ';'; + break; + + case ROR: + s << 'r' << a << "=rotate_right(r" << a << ",r" << b << ");"; + break; + + case ROL: + s << 'r' << a << "=rotate_left(r" << a << ",r" << b << ");"; + break; + + case XOR: + s << 'r' << a << "^=r" << b << ';'; + break; + } + + s << '\n'; + } + + return s.str(); +} + +struct CacheEntry +{ + CacheEntry(xmrstak_algo algo, uint64_t height, int arch_major, int arch_minor, const std::vector& ptx, const std::string& lowered_name) : + algo(algo), + height(height), + arch_major(arch_major), + arch_minor(arch_minor), + ptx(ptx), + lowered_name(lowered_name) + {} + + xmrstak_algo algo; + uint64_t height; + int arch_major; + int arch_minor; + std::vector ptx; + std::string lowered_name; +}; + +struct BackgroundTaskBase +{ + virtual ~BackgroundTaskBase() 
{} + virtual void exec() = 0; +}; + +template +struct BackgroundTask : public BackgroundTaskBase +{ + BackgroundTask(T&& func) : m_func(std::move(func)) {} + void exec() override { m_func(); } + + T m_func; +}; + +static ::cpputil::RWLock CryptonightR_cache_mutex; +static std::mutex CryptonightR_build_mutex; +static std::vector CryptonightR_cache; + +static std::mutex background_tasks_mutex; +static std::vector background_tasks; +static std::thread* background_thread = nullptr; + +static void background_thread_proc() +{ + std::vector tasks; + for (;;) { + tasks.clear(); + { + std::lock_guard g(background_tasks_mutex); + background_tasks.swap(tasks); + } + + for (BackgroundTaskBase* task : tasks) { + task->exec(); + delete task; + } + + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + } +} + +template +static void background_exec(T&& func) +{ + BackgroundTaskBase* task = new BackgroundTask(std::move(func)); + + std::lock_guard g(background_tasks_mutex); + background_tasks.push_back(task); + if (!background_thread) { + background_thread = new std::thread(background_thread_proc); + } +} + +static void CryptonightR_build_program( + std::vector& ptx, + std::string& lowered_name, + const xmrstak_algo& algo, + uint64_t height, + int arch_major, + int arch_minor, + std::string source) +{ + { + CryptonightR_cache_mutex.WriteLock(); + + // Remove old programs from cache + for (size_t i = 0; i < CryptonightR_cache.size();) + { + const CacheEntry& entry = CryptonightR_cache[i]; + if ((entry.algo == algo) && (entry.height + 2 < height)) + { + printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu released (old program)", entry.height); + CryptonightR_cache[i] = std::move(CryptonightR_cache.back()); + CryptonightR_cache.pop_back(); + } + else + { + ++i; + } + } + CryptonightR_cache_mutex.UnLock(); + } + + ptx.clear(); + ptx.reserve(65536); + + std::lock_guard g1(CryptonightR_build_mutex); + { + CryptonightR_cache_mutex.ReadLock(); + + // Check if 
the cache already has this program (some other thread might have added it first) + for (const CacheEntry& entry : CryptonightR_cache) + { + if ((entry.algo == algo) && (entry.height == height) && (entry.arch_major == arch_major) && (entry.arch_minor == arch_minor)) + { + ptx = entry.ptx; + lowered_name = entry.lowered_name; + CryptonightR_cache_mutex.UnLock(); + return; + } + } + CryptonightR_cache_mutex.UnLock(); + } + + nvrtcProgram prog; + nvrtcResult result = nvrtcCreateProgram(&prog, source.c_str(), "CryptonightR.curt", 0, NULL, NULL); + if (result != NVRTC_SUCCESS) { + printer::inst()->print_msg(L0, "nvrtcCreateProgram failed: %s", nvrtcGetErrorString(result)); + return; + } + + result = nvrtcAddNameExpression(prog, "CryptonightR_phase2"); + if (result != NVRTC_SUCCESS) { + printer::inst()->print_msg(L0, "nvrtcAddNameExpression failed: %s", nvrtcGetErrorString(result)); + nvrtcDestroyProgram(&prog); + return; + } + + char opt0[64]; + sprintf(opt0, "--gpu-architecture=compute_%d%d", arch_major, arch_minor); + + char opt1[64]; + sprintf(opt1, "-DALGO=%d", static_cast(algo.Id())); + + const char* opts[2] = { opt0, opt1 }; + + result = nvrtcCompileProgram(prog, 2, opts); + if (result != NVRTC_SUCCESS) { + printer::inst()->print_msg(L0, "nvrtcCompileProgram failed: %s", nvrtcGetErrorString(result)); + + size_t logSize; + if (nvrtcGetProgramLogSize(prog, &logSize) == NVRTC_SUCCESS) { + char *log = new char[logSize]; + if (nvrtcGetProgramLog(prog, log) == NVRTC_SUCCESS) { + printer::inst()->print_msg(L0, "Program compile log: %s", log); + } + delete[]log; + } + nvrtcDestroyProgram(&prog); + return; + } + + + const char* name; + result = nvrtcGetLoweredName(prog, "CryptonightR_phase2", &name); + if (result != NVRTC_SUCCESS) { + printer::inst()->print_msg(L0, "nvrtcGetLoweredName failed: %s", nvrtcGetErrorString(result)); + nvrtcDestroyProgram(&prog); + return; + } + + size_t ptxSize; + result = nvrtcGetPTXSize(prog, &ptxSize); + if (result != NVRTC_SUCCESS) { + 
printer::inst()->print_msg(L0, "nvrtcGetPTXSize failed: %s", nvrtcGetErrorString(result)); + nvrtcDestroyProgram(&prog); + return; + } + + ptx.resize(ptxSize); + result = nvrtcGetPTX(prog, ptx.data()); + if (result != NVRTC_SUCCESS) { + printer::inst()->print_msg(L0, "nvrtcGetPTX failed: %s", nvrtcGetErrorString(result)); + nvrtcDestroyProgram(&prog); + return; + } + + lowered_name = name; + + nvrtcDestroyProgram(&prog); + + printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu compiled", height); + + CryptonightR_cache_mutex.WriteLock(); + CryptonightR_cache.emplace_back(algo, height, arch_major, arch_minor, ptx, lowered_name); + CryptonightR_cache_mutex.UnLock(); +} + +void CryptonightR_get_program(std::vector<char>& ptx, std::string& lowered_name, const xmrstak_algo algo, uint64_t height, int arch_major, int arch_minor, bool background) +{ + if (background) { + background_exec([=]() { std::vector<char> tmp; std::string s; CryptonightR_get_program(tmp, s, algo, height, arch_major, arch_minor, false); }); + return; + } + + ptx.clear(); + + const char* source_code_template = + #include "nvcc_code/cuda_cryptonight_r.curt" + ; + const char include_name[] = "XMRSTAK_INCLUDE_RANDOM_MATH"; + const char* offset = strstr(source_code_template, include_name); + if (!offset) + { + printer::inst()->print_msg(L0, "CryptonightR_get_program: XMRSTAK_INCLUDE_RANDOM_MATH not found in cuda_cryptonight_r.curt"); + return; + } + + V4_Instruction code[256]; + int code_size; + switch (algo.Id()) + { + case cryptonight_r_wow: + code_size = v4_random_math_init<cryptonight_r_wow>(code, height); + break; + case cryptonight_r: + code_size = v4_random_math_init<cryptonight_r>(code, height); + break; + default: + printer::inst()->print_msg(LDEBUG, "CryptonightR_get_program: invalid algo %d", algo); + return; + } + + std::string source_code(source_code_template, offset); + source_code.append(get_code(code, code_size)); + source_code.append(offset + sizeof(include_name) - 1); + + { + CryptonightR_cache_mutex.ReadLock(); + + // 
Check if the cache has this program + for (const CacheEntry& entry : CryptonightR_cache) + { + if ((entry.algo == algo) && (entry.height == height) && (entry.arch_major == arch_major) && (entry.arch_minor == arch_minor)) + { + printer::inst()->print_msg(LDEBUG, "CryptonightR: program for height %llu found in cache", height); + ptx = entry.ptx; + lowered_name = entry.lowered_name; + CryptonightR_cache_mutex.UnLock(); + return; + } + } + CryptonightR_cache_mutex.UnLock(); + } + + CryptonightR_build_program(ptx, lowered_name, algo, height, arch_major, arch_minor, source_code); +} + +} // namespace xmrstak +} //namespace nvidia diff --git a/xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp b/xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp new file mode 100644 index 000000000..e214647b9 --- /dev/null +++ b/xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp @@ -0,0 +1,37 @@ +/* + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ * + */ + +#pragma once + +#include "xmrstak/backend/cryptonight.hpp" + +#include +#include +#include + + +namespace xmrstak +{ +namespace nvidia +{ + +void CryptonightR_get_program(std::vector& ptx, std::string& lowered_name, + const xmrstak_algo algo, uint64_t height, int arch_major, int arch_minor, bool background = false); + + +} // namespace xmrstak +} //namespace nvidia + diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp index 6460628de..794e68d11 100644 --- a/xmrstak/backend/nvidia/minethd.cpp +++ b/xmrstak/backend/nvidia/minethd.cpp @@ -173,6 +173,8 @@ std::vector* minethd::thread_starter(uint32_t threadOffset, miner_wor size_t i, n = jconf::inst()->GetGPUThreadCount(); pvThreads->reserve(n); + cuInit(0); + jconf::thd_cfg cfg; for (i = 0; i < n; i++) { @@ -226,7 +228,10 @@ void minethd::work_main() // start with root algorithm and switch later if fork version is reached auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot(); - cn_hash_fun hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo); + + cn_hash_fun hash_fun; + cpu::minethd::cn_on_new_job set_job; + cpu::minethd::func_multi_selector<1>(hash_fun, set_job, ::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo); uint32_t iNonce; @@ -255,17 +260,20 @@ void minethd::work_main() if(new_version >= coinDesc.GetMiningForkVersion()) { miner_algo = coinDesc.GetMiningAlgo(); - hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo); + cpu::minethd::func_multi_selector<1>(hash_fun, set_job, ::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo); } else { miner_algo = coinDesc.GetMiningAlgoRoot(); - hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo); + cpu::minethd::func_multi_selector<1>(hash_fun, set_job, 
::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo); } lastPoolId = oWork.iPoolId; version = new_version; } + if(set_job != nullptr) + set_job(oWork, &cpu_ctx); + cryptonight_extra_cpu_set_data(&ctx, oWork.bWorkBlob, oWork.iWorkSize); uint32_t h_per_round = ctx.device_blocks * ctx.device_threads; @@ -292,7 +300,7 @@ void minethd::work_main() cryptonight_extra_cpu_prepare(&ctx, iNonce, miner_algo); - cryptonight_core_cpu_hash(&ctx, miner_algo, iNonce); + cryptonight_core_cpu_hash(&ctx, miner_algo, iNonce, cpu_ctx->cn_r_ctx.height); cryptonight_extra_cpu_final(&ctx, iNonce, oWork.iTarget, &foundCount, foundNonce, miner_algo); @@ -307,7 +315,7 @@ void minethd::work_main() *(uint32_t*)(bWorkBlob + 39) = foundNonce[i]; - hash_fun(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx); + hash_fun(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx, miner_algo); if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget) executor::inst()->push_event(ex_event(job_result(oWork.sJobID, foundNonce[i], bResult, iThreadNo, miner_algo), oWork.iPoolId)); else diff --git a/xmrstak/backend/nvidia/minethd.hpp b/xmrstak/backend/nvidia/minethd.hpp index 389356842..3863c93e8 100644 --- a/xmrstak/backend/nvidia/minethd.hpp +++ b/xmrstak/backend/nvidia/minethd.hpp @@ -28,7 +28,7 @@ class minethd : public iBackend static bool self_test(); private: - typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**); + typedef void (*cn_hash_fun)(const void*, size_t, void*, cryptonight_ctx**, const xmrstak_algo&); minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg); void start_mining(); diff --git a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp index 8fda8d401..fe77b6f81 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cryptonight.hpp @@ -6,6 +6,8 @@ #include "xmrstak/jconf.hpp" #include "xmrstak/backend/cryptonight.hpp" +#include + typedef struct { int device_id; 
const char *device_name; @@ -33,6 +35,13 @@ typedef struct { std::string name; size_t free_device_memory; size_t total_device_memory; + + CUdevice cuDevice; + CUcontext cuContext; + CUmodule module = nullptr; + CUfunction kernel = nullptr; + uint64_t kernel_height = 0; + xmrstak_algo cached_algo = {xmrstak_algo_id::invalid_algo}; } nvid_ctx; extern "C" { @@ -46,8 +55,8 @@ int cuda_get_devicecount( int* deviceCount); int cuda_get_deviceinfo(nvid_ctx *ctx); int cryptonight_extra_cpu_init(nvid_ctx *ctx); void cryptonight_extra_cpu_set_data( nvid_ctx* ctx, const void *data, uint32_t len); -void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce, xmrstak_algo miner_algo); -void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce,xmrstak_algo miner_algo); +void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce, const xmrstak_algo& miner_algo); +void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce, const xmrstak_algo& miner_algo); } -void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce); +void cryptonight_core_cpu_hash(nvid_ctx* ctx, const xmrstak_algo& miner_algo, uint32_t startNonce, uint64_t chain_height); diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index 87c1befa8..d082f3362 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -10,6 +10,8 @@ #include "xmrstak/jconf.hpp" #include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp" #include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp" +#include "xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp" +#include "xmrstak/backend/nvidia/CudaCryptonightR_gen.hpp" #ifdef _WIN32 @@ -127,8 +129,9 @@ __device__ __forceinline__ uint32_t rotate16( const uint32_t n ) return (n >> 
16u) | (n << 16u); } -template -__global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int partidx, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state2, uint32_t * __restrict__ ctx_key1 ) +__global__ void cryptonight_core_gpu_phase1( + const uint32_t ITERATIONS, const size_t MEMORY, + int threads, int bfactor, int partidx, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state2, uint32_t * __restrict__ ctx_key1 ) { __shared__ uint32_t sharedMemory[1024]; @@ -266,11 +269,13 @@ struct u64 : public uint2 * @tparam MEM_MODE if `0` than 64bit memory transfers per thread will be used to store/load data within shared memory * else if `1` 256bit operations will be used */ -template +template #ifdef XMR_STAK_THREADS __launch_bounds__( XMR_STAK_THREADS * 2 ) #endif -__global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state, +__global__ void cryptonight_core_gpu_phase2_double( + const uint32_t ITERATIONS, const size_t MEMORY, const uint32_t MASK, + int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state, uint32_t startNonce, uint32_t * __restrict__ d_input ) { __shared__ uint32_t sharedMemory[512]; @@ -312,11 +317,11 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in uint64_t division_result; if(ALGO == cryptonight_monero_v8) { - bx0 = ((uint64_t*)(d_ctx_b + thread * 12))[sub]; - bx1 = ((uint64_t*)(d_ctx_b + thread * 12 + 4))[sub]; + bx0 = ((uint64_t*)(d_ctx_b + thread * 16))[sub]; + bx1 = ((uint64_t*)(d_ctx_b + thread * 16 + 4))[sub]; - division_result = ((uint64_t*)(d_ctx_b + thread * 12 + 4 * 2))[0]; - sqrt_result = (d_ctx_b + thread * 12 + 4 * 2 + 2)[0]; + division_result = ((uint64_t*)(d_ctx_b + thread * 16 + 4 * 2))[0]; + sqrt_result = (d_ctx_b + thread * 16 + 4 * 2 + 2)[0]; } else bx0 = 
((uint64_t*)(d_ctx_b + thread * 4))[sub]; @@ -400,7 +405,7 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in ((uint32_t*)&cx_mul)[0] = shuffle<2>(sPtr, sub, cx_aes.x , 0); ((uint32_t*)&cx_mul)[1] = shuffle<2>(sPtr, sub, cx_aes.y , 0); - if(ALGO == cryptonight_monero_v8 && sub == 1) + if((ALGO == cryptonight_monero_v8) && sub == 1) { // Use division and square root results from the _previous_ iteration to hide the latency ((uint32_t*)&division_result)[1] ^= sqrt_result; @@ -466,14 +471,14 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in ((uint64_t*)(d_ctx_a + thread * 4))[sub] = ax0; if(ALGO == cryptonight_monero_v8) { - ((uint64_t*)(d_ctx_b + thread * 12))[sub] = bx0; - ((uint64_t*)(d_ctx_b + thread * 12 + 4))[sub] = bx1; + ((uint64_t*)(d_ctx_b + thread * 16))[sub] = bx0; + ((uint64_t*)(d_ctx_b + thread * 16 + 4))[sub] = bx1; if(sub == 1) { // must be valid only for `sub == 1` - ((uint64_t*)(d_ctx_b + thread * 12 + 4 * 2))[0] = division_result; - (d_ctx_b + thread * 12 + 4 * 2 + 2)[0] = sqrt_result; + ((uint64_t*)(d_ctx_b + thread * 16 + 4 * 2))[0] = division_result; + (d_ctx_b + thread * 16 + 4 * 2 + 2)[0] = sqrt_result; } } else @@ -481,11 +486,13 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in } } -template +template #ifdef XMR_STAK_THREADS __launch_bounds__( XMR_STAK_THREADS * 4 ) #endif -__global__ void cryptonight_core_gpu_phase2_quad( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state, +__global__ void cryptonight_core_gpu_phase2_quad( + const uint32_t ITERATIONS, const size_t MEMORY, const uint32_t MASK, + int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state, uint32_t startNonce, uint32_t * __restrict__ d_input ) { __shared__ uint32_t sharedMemory[1024]; @@ -517,6 +524,15 @@ __global__ void 
cryptonight_core_gpu_phase2_quad( int threads, int bfactor, int uint32_t a, d[2], idx0; uint32_t t1[2], t2[2], res; + float conc_var; + if(ALGO == cryptonight_conceal) + { + if(partidx != 0) + conc_var = int_as_float(*(d_ctx_b + threads * 4 + thread * 4 + sub)); + else + conc_var = 0.0f; + } + uint32_t tweak1_2[2]; if (ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) { @@ -579,7 +595,23 @@ __global__ void cryptonight_core_gpu_phase2_quad( int threads, int bfactor, int } else { - const uint32_t x_0 = loadGlobal32( long_state + j ); + uint32_t x_0 = loadGlobal32( long_state + j ); + + if(ALGO == cryptonight_conceal) + { + float r = int2float((int32_t)x_0); + float c_old = conc_var; + + r += conc_var; + r = r * r * r; + r = int_as_float((float_as_int(r) & 0x807FFFFF) | 0x40000000); + conc_var += r; + + c_old = int_as_float((float_as_int(c_old) & 0x807FFFFF) | 0x40000000); + c_old *= 536870880.0f; + x_0 = (uint32_t)(((int32_t)x_0) ^ ((int32_t)c_old)); + } + const uint32_t x_1 = shuffle<4>(sPtr,sub, x_0, sub + 1); const uint32_t x_2 = shuffle<4>(sPtr,sub, x_0, sub + 2); const uint32_t x_3 = shuffle<4>(sPtr,sub, x_0, sub + 3); @@ -681,11 +713,15 @@ __global__ void cryptonight_core_gpu_phase2_quad( int threads, int bfactor, int if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) if(sub&1) *(d_ctx_b + threads * 4 + thread) = idx0; + if(ALGO == cryptonight_conceal) + *(d_ctx_b + threads * 4 + thread * 4 + sub) = float_as_int(conc_var); } } -template -__global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int partidx, const uint32_t * __restrict__ long_state, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_key2 ) +template +__global__ void cryptonight_core_gpu_phase3( + const uint32_t ITERATIONS, const size_t MEMORY, + int threads, int 
bfactor, int partidx, const uint32_t * __restrict__ long_state, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_key2 ) { __shared__ uint32_t sharedMemory[1024]; @@ -724,7 +760,8 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti cn_aes_pseudo_round_mut( sharedMemory, text, key ); - if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || + ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) { #pragma unroll for ( int j = 0; j < 4; ++j ) @@ -735,9 +772,13 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti MEMCPY8( d_ctx_state + thread * 50 + sub + 16, text, 2 ); } -template -void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce) +template +void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo& algo) { + uint32_t MASK = algo.Mask(); + uint32_t ITERATIONS = algo.Iter(); + size_t MEM = algo.Mem()/4; + dim3 grid( ctx->device_blocks ); dim3 block( ctx->device_threads ); dim3 block2( ctx->device_threads << 1 ); @@ -759,7 +800,10 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce) for ( int i = 0; i < partcountOneThree; i++ ) { - CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase1<<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads, + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase1<<< grid, block8 >>>( + ITERATIONS, + MEM, + ctx->device_blocks*ctx->device_threads, bfactorOneThree, i, ctx->d_long_state, (ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast ? 
ctx->d_ctx_state2 : ctx->d_ctx_state), @@ -777,13 +821,16 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce) CUDA_CHECK_MSG_KERNEL( ctx->device_id, "\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**", - cryptonight_core_gpu_phase2_double<<< + cryptonight_core_gpu_phase2_double<<< grid, block2, - sizeof(uint64_t) * block2.x * 8 + + sizeof(uint64_t) * block.x * 8 + // shuffle memory for fermi gpus block2.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ) >>>( + ITERATIONS, + MEM, + MASK, ctx->device_blocks*ctx->device_threads, ctx->device_bfactor, i, @@ -796,16 +843,39 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce) ) ); } + else if(ALGO == cryptonight_r_wow || ALGO == cryptonight_r) + { + int numThreads = ctx->device_blocks*ctx->device_threads; + void* args[] = { + &ITERATIONS, &MEM, &MASK, + &numThreads, &ctx->device_bfactor, &i, + &ctx->d_long_state, &ctx->d_ctx_a, &ctx->d_ctx_b, &ctx->d_ctx_state, &nonce, &ctx->d_input + }; + CU_CHECK(ctx->device_id, cuLaunchKernel( + ctx->kernel, + grid.x, grid.y, grid.z, + block2.x, block2.y, block2.z, + sizeof(uint64_t) * block.x * 8 + + // shuffle memory for fermi gpus + block2.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ), + nullptr, + args, 0 + )); + CU_CHECK(ctx->device_id, cuCtxSynchronize()); + } else { CUDA_CHECK_MSG_KERNEL( ctx->device_id, "\n**suggestion: Try to increase the value of the attribute 'bfactor' or \nreduce 'threads' in the NVIDIA config file.**", - cryptonight_core_gpu_phase2_quad<<< + cryptonight_core_gpu_phase2_quad<<< grid, block4, block4.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ) >>>( + ITERATIONS, + MEM, + MASK, ctx->device_blocks*ctx->device_threads, ctx->device_bfactor, i, @@ -832,64 +902,179 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce) for ( int i = 0; i < roundsPhase3; i++ ) { - CUDA_CHECK_KERNEL(ctx->device_id, 
cryptonight_core_gpu_phase3<<< + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<<< + grid, + block8, + block8.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ) + >>>( + ITERATIONS, + MEM, + ctx->device_blocks*ctx->device_threads, + bfactorOneThree, i, + ctx->d_long_state, + ctx->d_ctx_state, ctx->d_ctx_key2 )); + } +} + +template +void cryptonight_core_gpu_hash_gpu(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo& algo) +{ + const uint32_t MASK = algo.Mask(); + const uint32_t ITERATIONS = algo.Iter(); + const size_t MEM = algo.Mem(); + + dim3 grid( ctx->device_blocks ); + dim3 block( ctx->device_threads ); + dim3 block2( ctx->device_threads << 1 ); + dim3 block4( ctx->device_threads << 2 ); + dim3 block8( ctx->device_threads << 3 ); + + size_t intensity = ctx->device_blocks * ctx->device_threads; + + CUDA_CHECK_KERNEL( + ctx->device_id, + xmrstak::nvidia::cn_explode_gpu<<>>(MEM, (int*)ctx->d_ctx_state, (int*)ctx->d_long_state) + ); + + int partcount = 1 << ctx->device_bfactor; + for(int i = 0; i < partcount; i++) + { + CUDA_CHECK_KERNEL( + ctx->device_id, + // 36 x 16byte x numThreads + xmrstak::nvidia::cryptonight_core_gpu_phase2_gpu + <<device_blocks, ctx->device_threads * 16, 32 * 16 * ctx->device_threads>>> + ( + ITERATIONS, + MEM, + MASK, + (int*)ctx->d_ctx_state, + (int*)ctx->d_long_state, + ctx->device_bfactor, + i, + ctx->d_ctx_a, + ctx->d_ctx_b + ) + ); + } + + /* bfactor for phase 3 + * + * 3 consume less time than phase 2, therefore we begin with the + * kernel splitting if the user defined a `bfactor >= 5` + */ + int bfactorOneThree = ctx->device_bfactor - 4; + if( bfactorOneThree < 0 ) + bfactorOneThree = 0; + + int partcountOneThree = 1 << bfactorOneThree; + int roundsPhase3 = partcountOneThree; + + if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || + ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast ) + { + // cryptonight_heavy used two full rounds over the scratchpad 
memory + roundsPhase3 *= 2; + } + + for ( int i = 0; i < roundsPhase3; i++ ) + { + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase3<<< grid, block8, block8.x * sizeof(uint32_t) * static_cast< int >( ctx->device_arch[0] < 3 ) - >>>( ctx->device_blocks*ctx->device_threads, + >>>( + ITERATIONS, + MEM/4, + ctx->device_blocks*ctx->device_threads, bfactorOneThree, i, ctx->d_long_state, ctx->d_ctx_state, ctx->d_ctx_key2 )); } } -void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce) +void cryptonight_core_cpu_hash(nvid_ctx* ctx, const xmrstak_algo& miner_algo, uint32_t startNonce, uint64_t chain_height) { - typedef void (*cuda_hash_fn)(nvid_ctx* ctx, uint32_t nonce); + + if((miner_algo == cryptonight_r_wow) || (miner_algo == cryptonight_r)) + { + if(ctx->kernel_height != chain_height || ctx->cached_algo != miner_algo) + { + if(ctx->module) + cuModuleUnload(ctx->module); + + std::vector ptx; + std::string lowered_name; + xmrstak::nvidia::CryptonightR_get_program(ptx, lowered_name, miner_algo, chain_height, ctx->device_arch[0], ctx->device_arch[1]); + + CU_CHECK(ctx->device_id, cuModuleLoadDataEx(&ctx->module, ptx.data(), 0, 0, 0)); + CU_CHECK(ctx->device_id, cuModuleGetFunction(&ctx->kernel, ctx->module, lowered_name.c_str())); + + ctx->kernel_height = chain_height; + ctx->cached_algo = miner_algo; + + xmrstak::nvidia::CryptonightR_get_program(ptx, lowered_name, miner_algo, chain_height + 1, ctx->device_arch[0], ctx->device_arch[1], true); + } + } + + typedef void (*cuda_hash_fn)(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo& algo); if(miner_algo == invalid_algo) return; static const cuda_hash_fn func_table[] = { - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, + 
cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash, - - cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash + cryptonight_core_gpu_hash_gpu, + cryptonight_core_gpu_hash_gpu, + + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, + + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash }; std::bitset<1> digit; digit.set(0, ctx->memMode == 1); cuda_hash_fn selected_function = func_table[ ((miner_algo - 1u) << 1) | digit.to_ulong() ]; - selected_function(ctx, startNonce); + selected_function(ctx, startNonce, miner_algo); + } diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp new file mode 100644 index 000000000..fee7e13d1 --- /dev/null +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_cryptonight_gpu.hpp @@ -0,0 +1,564 @@ +#pragma once + +#include +#include +#include + +#include "cuda_keccak.hpp" +#include "cuda_extra.hpp" + +namespace xmrstak +{ 
+namespace nvidia +{ + +struct __m128i : public int4 +{ + + __forceinline__ __device__ __m128i(){} + + __forceinline__ __device__ __m128i( + const uint32_t x0, const uint32_t x1, + const uint32_t x2, const uint32_t x3) + { + x = x0; + y = x1; + z = x2; + w = x3; + } + + __forceinline__ __device__ __m128i( const int x0) + { + x = x0; + y = x0; + z = x0; + w = x0; + } + + __forceinline__ __device__ __m128i operator|(const __m128i& other) + { + return __m128i( + x | other.x, + y | other.y, + z | other.z, + w | other.w + ); + } + + __forceinline__ __device__ __m128i operator^(const __m128i& other) + { + return __m128i( + x ^ other.x, + y ^ other.y, + z ^ other.z, + w ^ other.w + ); + } +}; + +struct __m128 : public float4 +{ + + __forceinline__ __device__ __m128(){} + + __forceinline__ __device__ __m128( + const float x0, const float x1, + const float x2, const float x3) + { + float4::x = x0; + float4::y = x1; + float4::z = x2; + float4::w = x3; + } + + __forceinline__ __device__ __m128( const float x0) + { + float4::x = x0; + float4::y = x0; + float4::z = x0; + float4::w = x0; + } + + __forceinline__ __device__ __m128( const __m128i& x0) + { + float4::x = int2float(x0.x); + float4::y = int2float(x0.y); + float4::z = int2float(x0.z); + float4::w = int2float(x0.w); + } + + __forceinline__ __device__ __m128i get_int( ) + { + return __m128i( + (int)x, + (int)y, + (int)z, + (int)w + ); + } + + __forceinline__ __device__ __m128 operator+(const __m128& other) + { + return __m128( + x + other.x, + y + other.y, + z + other.z, + w + other.w + ); + } + + __forceinline__ __device__ __m128 operator-(const __m128& other) + { + return __m128( + x - other.x, + y - other.y, + z - other.z, + w - other.w + ); + } + + __forceinline__ __device__ __m128 operator*(const __m128& other) + { + return __m128( + x * other.x, + y * other.y, + z * other.z, + w * other.w + ); + } + + __forceinline__ __device__ __m128 operator/(const __m128& other) + { + return __m128( + x / other.x, + y / other.y, 
+ z / other.z, + w / other.w + ); + } + + __forceinline__ __device__ __m128& trunc() + { + x=::truncf(x); + y=::truncf(y); + z=::truncf(z); + w=::truncf(w); + + return *this; + } + + __forceinline__ __device__ __m128& abs() + { + x=::fabsf(x); + y=::fabsf(y); + z=::fabsf(z); + w=::fabsf(w); + + return *this; + } + + __forceinline__ __device__ __m128& floor() + { + x=::floorf(x); + y=::floorf(y); + z=::floorf(z); + w=::floorf(w); + + return *this; + } +}; + + +template +__device__ void print(const char* name, T value) +{ + printf("g %s: ", name); + for(int i = 0; i < 4; ++i) + { + printf("%08X ",((uint32_t*)&value)[i]); + } + printf("\n"); +} + +template<> +__device__ void print<__m128>(const char* name, __m128 value) +{ + printf("g %s: ", name); + for(int i = 0; i < 4; ++i) + { + printf("%f ",((float*)&value)[i]); + } + printf("\n"); +} + +#define SHOW(name) print(#name, name) + + +__forceinline__ __device__ __m128 _mm_add_ps(__m128 a, __m128 b) +{ + return a + b; +} + +__forceinline__ __device__ __m128 _mm_sub_ps(__m128 a, __m128 b) +{ + return a - b; +} + +__forceinline__ __device__ __m128 _mm_mul_ps(__m128 a, __m128 b) +{ + return a * b; +} + +__forceinline__ __device__ __m128 _mm_div_ps(__m128 a, __m128 b) +{ + return a / b; +} + +__forceinline__ __device__ __m128 _mm_and_ps(__m128 a, int b) +{ + return __m128( + int_as_float(float_as_int(a.x) & b), + int_as_float(float_as_int(a.y) & b), + int_as_float(float_as_int(a.z) & b), + int_as_float(float_as_int(a.w) & b) + ); +} + +__forceinline__ __device__ __m128 _mm_or_ps(__m128 a, int b) +{ + return __m128( + int_as_float(float_as_int(a.x) | b), + int_as_float(float_as_int(a.y) | b), + int_as_float(float_as_int(a.z) | b), + int_as_float(float_as_int(a.w) | b) + ); +} + +__forceinline__ __device__ __m128 _mm_xor_ps(__m128 a, int b) +{ + return __m128( + int_as_float(float_as_int(a.x) ^ b), + int_as_float(float_as_int(a.y) ^ b), + int_as_float(float_as_int(a.z) ^ b), + int_as_float(float_as_int(a.w) ^ b) + ); +} + 
+__forceinline__ __device__ __m128 _mm_fmod_ps(__m128 v, float dc) +{ + __m128 d(dc); + __m128 c = _mm_div_ps(v, d); + c.trunc();//_mm_round_ps(c, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); + // c = _mm_cvtepi32_ps(_mm_cvttps_epi32(c)); - sse2 + c = _mm_mul_ps(c, d); + return _mm_sub_ps(v, c); + + + //return a.fmodf(b); +} + +__forceinline__ __device__ __m128i _mm_xor_si128(__m128i a, __m128i b) +{ + return a ^ b; +} + + +__forceinline__ __device__ __m128i _mm_alignr_epi8(__m128i a, const uint32_t rot) +{ + const uint32_t right = 8 * rot; + const uint32_t left = (32 - 8 * rot); + return __m128i( + ((uint32_t)a.x >> right) | ( a.y << left ), + ((uint32_t)a.y >> right) | ( a.z << left ), + ((uint32_t)a.z >> right) | ( a.w << left ), + ((uint32_t)a.w >> right) | ( a.x << left ) + ); +} + +__device__ __m128i* scratchpad_ptr(uint32_t idx, uint32_t n, int *lpad, const uint32_t MASK) { return (__m128i*)((uint8_t*)lpad + (idx & MASK) + n * 16); } + + +__forceinline__ __device__ __m128 fma_break(__m128 x) +{ + // Break the dependency chain by setitng the exp to ?????01 + x = _mm_and_ps(x, 0xFEFFFFFF); + return _mm_or_ps(x, 0x00800000); +} + +// 9 +__forceinline__ __device__ void sub_round(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c, __m128& n, __m128& d, __m128& c) +{ + n1 = _mm_add_ps(n1, c); + __m128 nn = _mm_mul_ps(n0, c); + nn = _mm_mul_ps(n1, _mm_mul_ps(nn,nn)); + nn = fma_break(nn); + n = _mm_add_ps(n, nn); + + n3 = _mm_sub_ps(n3, c); + __m128 dd = _mm_mul_ps(n2, c); + dd = _mm_mul_ps(n3, _mm_mul_ps(dd,dd)); + dd = fma_break(dd); + d = _mm_add_ps(d, dd); + + //Constant feedback + c = _mm_add_ps(c, rnd_c); + c = _mm_add_ps(c, 0.734375f); + __m128 r = _mm_add_ps(nn, dd); + r = _mm_and_ps(r, 0x807FFFFF); + r = _mm_or_ps(r, 0x40000000); + c = _mm_add_ps(c, r); +} + +// 9*8 + 2 = 74 +__forceinline__ __device__ void round_compute(__m128 n0, __m128 n1, __m128 n2, __m128 n3, __m128 rnd_c, __m128& c, __m128& r) +{ + __m128 n(0.0f), d(0.0f); + + sub_round(n0, n1, n2, 
n3, rnd_c, n, d, c); + sub_round(n1, n2, n3, n0, rnd_c, n, d, c); + sub_round(n2, n3, n0, n1, rnd_c, n, d, c); + sub_round(n3, n0, n1, n2, rnd_c, n, d, c); + sub_round(n3, n2, n1, n0, rnd_c, n, d, c); + sub_round(n2, n1, n0, n3, rnd_c, n, d, c); + sub_round(n1, n0, n3, n2, rnd_c, n, d, c); + sub_round(n0, n3, n2, n1, rnd_c, n, d, c); + + // Make sure abs(d) > 2.0 - this prevents division by zero and accidental overflows by division by < 1.0 + d = _mm_and_ps(d, 0xFF7FFFFF); + d = _mm_or_ps(d, 0x40000000); + r =_mm_add_ps(r, _mm_div_ps(n,d)); +} + +// 74*8 = 595 +__forceinline__ __device__ __m128i single_comupte(__m128 n0, __m128 n1, __m128 n2, __m128 n3, float cnt, __m128 rnd_c, __m128& sum) +{ + __m128 c(cnt); + // 35 maths calls follow (140 FLOPS) + __m128 r = __m128(0.0f); + for(int i=0; i< 4; ++i) + round_compute(n0, n1, n2, n3, rnd_c, c, r); + // do a quick fmod by setting exp to 2 + r = _mm_and_ps(r, 0x807FFFFF); + r = _mm_or_ps(r, 0x40000000); + sum = r; // 34 + r = _mm_mul_ps(r, __m128(536870880.0f)); // 35 + return r.get_int(); + +} + +__forceinline__ __device__ void single_comupte_wrap(const uint32_t rot, const __m128i& v0, const __m128i& v1, const __m128i& v2, const __m128i& v3, float cnt, __m128 rnd_c, __m128& sum, __m128i& out) +{ + __m128 n0(v0); + __m128 n1(v1); + __m128 n2(v2); + __m128 n3(v3); + + __m128i r = single_comupte(n0, n1, n2, n3, cnt, rnd_c, sum); + out = rot == 0 ? 
r : _mm_alignr_epi8(r, rot); +} + +__constant__ uint32_t look[16][4] = { + {0, 1, 2, 3}, + {0, 2, 3, 1}, + {0, 3, 1, 2}, + {0, 3, 2, 1}, + + {1, 0, 2, 3}, + {1, 2, 3, 0}, + {1, 3, 0, 2}, + {1, 3, 2, 0}, + + {2, 1, 0, 3}, + {2, 0, 3, 1}, + {2, 3, 1, 0}, + {2, 3, 0, 1}, + + {3, 1, 2, 0}, + {3, 2, 0, 1}, + {3, 0, 1, 2}, + {3, 0, 2, 1} +}; + +__constant__ float ccnt[16] = { + 1.34375f, + 1.28125f, + 1.359375f, + 1.3671875f, + + 1.4296875f, + 1.3984375f, + 1.3828125f, + 1.3046875f, + + 1.4140625f, + 1.2734375f, + 1.2578125f, + 1.2890625f, + + 1.3203125f, + 1.3515625f, + 1.3359375f, + 1.4609375f +}; + + +__forceinline__ __device__ void sync() +{ +#if (__CUDACC_VER_MAJOR__ >= 9) + __syncwarp(); +#else + __syncthreads( ); +#endif +} + +struct SharedMemChunk +{ + __m128i out[16]; + __m128 va[16]; +}; + +__global__ void cryptonight_core_gpu_phase2_gpu( + const uint32_t ITERATIONS, const size_t MEMORY, const uint32_t MASK, + int32_t *spad, int *lpad_in, int bfactor, int partidx, uint32_t * roundVs, uint32_t * roundS) +{ + + const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor ); + + extern __shared__ SharedMemChunk smemExtern_in[]; + + const uint32_t chunk = threadIdx.x / 16; + const uint32_t numHashPerBlock = blockDim.x / 16; + + int* lpad = (int*)((uint8_t*)lpad_in + size_t(MEMORY) * (blockIdx.x * numHashPerBlock + chunk)); + + SharedMemChunk* smem = smemExtern_in + chunk; + + uint32_t tid = threadIdx.x % 16; + + const uint32_t idxHash = blockIdx.x * numHashPerBlock + threadIdx.x/16; + uint32_t s = 0; + + __m128 vs(0); + if(partidx != 0) + { + vs = ((__m128*)roundVs)[idxHash]; + s = roundS[idxHash]; + } + else + { + s = ((uint32_t*)spad)[idxHash * 50] >> 8; + } + + // tid divided + const uint32_t tidd = tid / 4; + // tid modulo + const uint32_t tidm = tid % 4; + const uint32_t block = tidd * 16 + tidm; + + for(size_t i = 0; i < batchsize; i++) + { + sync(); + int tmp = ((int*)scratchpad_ptr(s, tidd, lpad, MASK))[tidm]; + ((int*)smem->out)[tid] = tmp; + sync(); + + __m128 
rc = vs; + single_comupte_wrap( + tidm, + *(smem->out + look[tid][0]), + *(smem->out + look[tid][1]), + *(smem->out + look[tid][2]), + *(smem->out + look[tid][3]), + ccnt[tid], rc, smem->va[tid], + smem->out[tid] + ); + + sync(); + + int outXor = ((int*)smem->out)[block]; + for(uint32_t dd = block + 4; dd < (tidd + 1) * 16; dd += 4) + outXor ^= ((int*)smem->out)[dd]; + + ((int*)scratchpad_ptr(s, tidd, lpad, MASK))[tidm] = outXor ^ tmp; + ((int*)smem->out)[tid] = outXor; + + float va_tmp1 = ((float*)smem->va)[block] + ((float*)smem->va)[block + 4]; + float va_tmp2 = ((float*)smem->va)[block+ 8] + ((float*)smem->va)[block + 12]; + ((float*)smem->va)[tid] = va_tmp1 + va_tmp2; + + sync(); + + __m128i out2 = smem->out[0] ^ smem->out[1] ^ smem->out[2] ^ smem->out[3]; + va_tmp1 = ((float*)smem->va)[block] + ((float*)smem->va)[block + 4]; + va_tmp2 = ((float*)smem->va)[block + 8] + ((float*)smem->va)[block + 12]; + ((float*)smem->va)[tid] = va_tmp1 + va_tmp2; + + sync(); + + vs = smem->va[0]; + vs.abs(); // take abs(va) by masking the float sign bit + auto xx = _mm_mul_ps(vs, __m128(16777216.0f)); + // vs range 0 - 64 + auto xx_int = xx.get_int(); + out2 = _mm_xor_si128(xx_int, out2); + // vs is now between 0 and 1 + vs = _mm_div_ps(vs, __m128(64.0f)); + s = out2.x ^ out2.y ^ out2.z ^ out2.w; + } + if(partidx != ((1<. 
+ * + */ + +#define cryptonight_r_wow 15 +#define cryptonight_r 16 + +typedef unsigned char uint8_t; +typedef unsigned int uint32_t; +typedef unsigned long long int uint64_t; + +static __constant__ uint32_t d_t_fn[1024] = +{ + 0xa56363c6U, 0x847c7cf8U, 0x997777eeU, 0x8d7b7bf6U, + 0x0df2f2ffU, 0xbd6b6bd6U, 0xb16f6fdeU, 0x54c5c591U, + 0x50303060U, 0x03010102U, 0xa96767ceU, 0x7d2b2b56U, + 0x19fefee7U, 0x62d7d7b5U, 0xe6abab4dU, 0x9a7676ecU, + 0x45caca8fU, 0x9d82821fU, 0x40c9c989U, 0x877d7dfaU, + 0x15fafaefU, 0xeb5959b2U, 0xc947478eU, 0x0bf0f0fbU, + 0xecadad41U, 0x67d4d4b3U, 0xfda2a25fU, 0xeaafaf45U, + 0xbf9c9c23U, 0xf7a4a453U, 0x967272e4U, 0x5bc0c09bU, + 0xc2b7b775U, 0x1cfdfde1U, 0xae93933dU, 0x6a26264cU, + 0x5a36366cU, 0x413f3f7eU, 0x02f7f7f5U, 0x4fcccc83U, + 0x5c343468U, 0xf4a5a551U, 0x34e5e5d1U, 0x08f1f1f9U, + 0x937171e2U, 0x73d8d8abU, 0x53313162U, 0x3f15152aU, + 0x0c040408U, 0x52c7c795U, 0x65232346U, 0x5ec3c39dU, + 0x28181830U, 0xa1969637U, 0x0f05050aU, 0xb59a9a2fU, + 0x0907070eU, 0x36121224U, 0x9b80801bU, 0x3de2e2dfU, + 0x26ebebcdU, 0x6927274eU, 0xcdb2b27fU, 0x9f7575eaU, + 0x1b090912U, 0x9e83831dU, 0x742c2c58U, 0x2e1a1a34U, + 0x2d1b1b36U, 0xb26e6edcU, 0xee5a5ab4U, 0xfba0a05bU, + 0xf65252a4U, 0x4d3b3b76U, 0x61d6d6b7U, 0xceb3b37dU, + 0x7b292952U, 0x3ee3e3ddU, 0x712f2f5eU, 0x97848413U, + 0xf55353a6U, 0x68d1d1b9U, 0x00000000U, 0x2cededc1U, + 0x60202040U, 0x1ffcfce3U, 0xc8b1b179U, 0xed5b5bb6U, + 0xbe6a6ad4U, 0x46cbcb8dU, 0xd9bebe67U, 0x4b393972U, + 0xde4a4a94U, 0xd44c4c98U, 0xe85858b0U, 0x4acfcf85U, + 0x6bd0d0bbU, 0x2aefefc5U, 0xe5aaaa4fU, 0x16fbfbedU, + 0xc5434386U, 0xd74d4d9aU, 0x55333366U, 0x94858511U, + 0xcf45458aU, 0x10f9f9e9U, 0x06020204U, 0x817f7ffeU, + 0xf05050a0U, 0x443c3c78U, 0xba9f9f25U, 0xe3a8a84bU, + 0xf35151a2U, 0xfea3a35dU, 0xc0404080U, 0x8a8f8f05U, + 0xad92923fU, 0xbc9d9d21U, 0x48383870U, 0x04f5f5f1U, + 0xdfbcbc63U, 0xc1b6b677U, 0x75dadaafU, 0x63212142U, + 0x30101020U, 0x1affffe5U, 0x0ef3f3fdU, 0x6dd2d2bfU, + 0x4ccdcd81U, 0x140c0c18U, 0x35131326U, 
0x2fececc3U, + 0xe15f5fbeU, 0xa2979735U, 0xcc444488U, 0x3917172eU, + 0x57c4c493U, 0xf2a7a755U, 0x827e7efcU, 0x473d3d7aU, + 0xac6464c8U, 0xe75d5dbaU, 0x2b191932U, 0x957373e6U, + 0xa06060c0U, 0x98818119U, 0xd14f4f9eU, 0x7fdcdca3U, + 0x66222244U, 0x7e2a2a54U, 0xab90903bU, 0x8388880bU, + 0xca46468cU, 0x29eeeec7U, 0xd3b8b86bU, 0x3c141428U, + 0x79dedea7U, 0xe25e5ebcU, 0x1d0b0b16U, 0x76dbdbadU, + 0x3be0e0dbU, 0x56323264U, 0x4e3a3a74U, 0x1e0a0a14U, + 0xdb494992U, 0x0a06060cU, 0x6c242448U, 0xe45c5cb8U, + 0x5dc2c29fU, 0x6ed3d3bdU, 0xefacac43U, 0xa66262c4U, + 0xa8919139U, 0xa4959531U, 0x37e4e4d3U, 0x8b7979f2U, + 0x32e7e7d5U, 0x43c8c88bU, 0x5937376eU, 0xb76d6ddaU, + 0x8c8d8d01U, 0x64d5d5b1U, 0xd24e4e9cU, 0xe0a9a949U, + 0xb46c6cd8U, 0xfa5656acU, 0x07f4f4f3U, 0x25eaeacfU, + 0xaf6565caU, 0x8e7a7af4U, 0xe9aeae47U, 0x18080810U, + 0xd5baba6fU, 0x887878f0U, 0x6f25254aU, 0x722e2e5cU, + 0x241c1c38U, 0xf1a6a657U, 0xc7b4b473U, 0x51c6c697U, + 0x23e8e8cbU, 0x7cdddda1U, 0x9c7474e8U, 0x211f1f3eU, + 0xdd4b4b96U, 0xdcbdbd61U, 0x868b8b0dU, 0x858a8a0fU, + 0x907070e0U, 0x423e3e7cU, 0xc4b5b571U, 0xaa6666ccU, + 0xd8484890U, 0x05030306U, 0x01f6f6f7U, 0x120e0e1cU, + 0xa36161c2U, 0x5f35356aU, 0xf95757aeU, 0xd0b9b969U, + 0x91868617U, 0x58c1c199U, 0x271d1d3aU, 0xb99e9e27U, + 0x38e1e1d9U, 0x13f8f8ebU, 0xb398982bU, 0x33111122U, + 0xbb6969d2U, 0x70d9d9a9U, 0x898e8e07U, 0xa7949433U, + 0xb69b9b2dU, 0x221e1e3cU, 0x92878715U, 0x20e9e9c9U, + 0x49cece87U, 0xff5555aaU, 0x78282850U, 0x7adfdfa5U, + 0x8f8c8c03U, 0xf8a1a159U, 0x80898909U, 0x170d0d1aU, + 0xdabfbf65U, 0x31e6e6d7U, 0xc6424284U, 0xb86868d0U, + 0xc3414182U, 0xb0999929U, 0x772d2d5aU, 0x110f0f1eU, + 0xcbb0b07bU, 0xfc5454a8U, 0xd6bbbb6dU, 0x3a16162cU, + 0x6363c6a5U, 0x7c7cf884U, 0x7777ee99U, 0x7b7bf68dU, + 0xf2f2ff0dU, 0x6b6bd6bdU, 0x6f6fdeb1U, 0xc5c59154U, + 0x30306050U, 0x01010203U, 0x6767cea9U, 0x2b2b567dU, + 0xfefee719U, 0xd7d7b562U, 0xabab4de6U, 0x7676ec9aU, + 0xcaca8f45U, 0x82821f9dU, 0xc9c98940U, 0x7d7dfa87U, + 0xfafaef15U, 0x5959b2ebU, 0x47478ec9U, 
0xf0f0fb0bU, + 0xadad41ecU, 0xd4d4b367U, 0xa2a25ffdU, 0xafaf45eaU, + 0x9c9c23bfU, 0xa4a453f7U, 0x7272e496U, 0xc0c09b5bU, + 0xb7b775c2U, 0xfdfde11cU, 0x93933daeU, 0x26264c6aU, + 0x36366c5aU, 0x3f3f7e41U, 0xf7f7f502U, 0xcccc834fU, + 0x3434685cU, 0xa5a551f4U, 0xe5e5d134U, 0xf1f1f908U, + 0x7171e293U, 0xd8d8ab73U, 0x31316253U, 0x15152a3fU, + 0x0404080cU, 0xc7c79552U, 0x23234665U, 0xc3c39d5eU, + 0x18183028U, 0x969637a1U, 0x05050a0fU, 0x9a9a2fb5U, + 0x07070e09U, 0x12122436U, 0x80801b9bU, 0xe2e2df3dU, + 0xebebcd26U, 0x27274e69U, 0xb2b27fcdU, 0x7575ea9fU, + 0x0909121bU, 0x83831d9eU, 0x2c2c5874U, 0x1a1a342eU, + 0x1b1b362dU, 0x6e6edcb2U, 0x5a5ab4eeU, 0xa0a05bfbU, + 0x5252a4f6U, 0x3b3b764dU, 0xd6d6b761U, 0xb3b37dceU, + 0x2929527bU, 0xe3e3dd3eU, 0x2f2f5e71U, 0x84841397U, + 0x5353a6f5U, 0xd1d1b968U, 0x00000000U, 0xededc12cU, + 0x20204060U, 0xfcfce31fU, 0xb1b179c8U, 0x5b5bb6edU, + 0x6a6ad4beU, 0xcbcb8d46U, 0xbebe67d9U, 0x3939724bU, + 0x4a4a94deU, 0x4c4c98d4U, 0x5858b0e8U, 0xcfcf854aU, + 0xd0d0bb6bU, 0xefefc52aU, 0xaaaa4fe5U, 0xfbfbed16U, + 0x434386c5U, 0x4d4d9ad7U, 0x33336655U, 0x85851194U, + 0x45458acfU, 0xf9f9e910U, 0x02020406U, 0x7f7ffe81U, + 0x5050a0f0U, 0x3c3c7844U, 0x9f9f25baU, 0xa8a84be3U, + 0x5151a2f3U, 0xa3a35dfeU, 0x404080c0U, 0x8f8f058aU, + 0x92923fadU, 0x9d9d21bcU, 0x38387048U, 0xf5f5f104U, + 0xbcbc63dfU, 0xb6b677c1U, 0xdadaaf75U, 0x21214263U, + 0x10102030U, 0xffffe51aU, 0xf3f3fd0eU, 0xd2d2bf6dU, + 0xcdcd814cU, 0x0c0c1814U, 0x13132635U, 0xececc32fU, + 0x5f5fbee1U, 0x979735a2U, 0x444488ccU, 0x17172e39U, + 0xc4c49357U, 0xa7a755f2U, 0x7e7efc82U, 0x3d3d7a47U, + 0x6464c8acU, 0x5d5dbae7U, 0x1919322bU, 0x7373e695U, + 0x6060c0a0U, 0x81811998U, 0x4f4f9ed1U, 0xdcdca37fU, + 0x22224466U, 0x2a2a547eU, 0x90903babU, 0x88880b83U, + 0x46468ccaU, 0xeeeec729U, 0xb8b86bd3U, 0x1414283cU, + 0xdedea779U, 0x5e5ebce2U, 0x0b0b161dU, 0xdbdbad76U, + 0xe0e0db3bU, 0x32326456U, 0x3a3a744eU, 0x0a0a141eU, + 0x494992dbU, 0x06060c0aU, 0x2424486cU, 0x5c5cb8e4U, + 0xc2c29f5dU, 0xd3d3bd6eU, 0xacac43efU, 
0x6262c4a6U, + 0x919139a8U, 0x959531a4U, 0xe4e4d337U, 0x7979f28bU, + 0xe7e7d532U, 0xc8c88b43U, 0x37376e59U, 0x6d6ddab7U, + 0x8d8d018cU, 0xd5d5b164U, 0x4e4e9cd2U, 0xa9a949e0U, + 0x6c6cd8b4U, 0x5656acfaU, 0xf4f4f307U, 0xeaeacf25U, + 0x6565caafU, 0x7a7af48eU, 0xaeae47e9U, 0x08081018U, + 0xbaba6fd5U, 0x7878f088U, 0x25254a6fU, 0x2e2e5c72U, + 0x1c1c3824U, 0xa6a657f1U, 0xb4b473c7U, 0xc6c69751U, + 0xe8e8cb23U, 0xdddda17cU, 0x7474e89cU, 0x1f1f3e21U, + 0x4b4b96ddU, 0xbdbd61dcU, 0x8b8b0d86U, 0x8a8a0f85U, + 0x7070e090U, 0x3e3e7c42U, 0xb5b571c4U, 0x6666ccaaU, + 0x484890d8U, 0x03030605U, 0xf6f6f701U, 0x0e0e1c12U, + 0x6161c2a3U, 0x35356a5fU, 0x5757aef9U, 0xb9b969d0U, + 0x86861791U, 0xc1c19958U, 0x1d1d3a27U, 0x9e9e27b9U, + 0xe1e1d938U, 0xf8f8eb13U, 0x98982bb3U, 0x11112233U, + 0x6969d2bbU, 0xd9d9a970U, 0x8e8e0789U, 0x949433a7U, + 0x9b9b2db6U, 0x1e1e3c22U, 0x87871592U, 0xe9e9c920U, + 0xcece8749U, 0x5555aaffU, 0x28285078U, 0xdfdfa57aU, + 0x8c8c038fU, 0xa1a159f8U, 0x89890980U, 0x0d0d1a17U, + 0xbfbf65daU, 0xe6e6d731U, 0x424284c6U, 0x6868d0b8U, + 0x414182c3U, 0x999929b0U, 0x2d2d5a77U, 0x0f0f1e11U, + 0xb0b07bcbU, 0x5454a8fcU, 0xbbbb6dd6U, 0x16162c3aU, + 0x63c6a563U, 0x7cf8847cU, 0x77ee9977U, 0x7bf68d7bU, + 0xf2ff0df2U, 0x6bd6bd6bU, 0x6fdeb16fU, 0xc59154c5U, + 0x30605030U, 0x01020301U, 0x67cea967U, 0x2b567d2bU, + 0xfee719feU, 0xd7b562d7U, 0xab4de6abU, 0x76ec9a76U, + 0xca8f45caU, 0x821f9d82U, 0xc98940c9U, 0x7dfa877dU, + 0xfaef15faU, 0x59b2eb59U, 0x478ec947U, 0xf0fb0bf0U, + 0xad41ecadU, 0xd4b367d4U, 0xa25ffda2U, 0xaf45eaafU, + 0x9c23bf9cU, 0xa453f7a4U, 0x72e49672U, 0xc09b5bc0U, + 0xb775c2b7U, 0xfde11cfdU, 0x933dae93U, 0x264c6a26U, + 0x366c5a36U, 0x3f7e413fU, 0xf7f502f7U, 0xcc834fccU, + 0x34685c34U, 0xa551f4a5U, 0xe5d134e5U, 0xf1f908f1U, + 0x71e29371U, 0xd8ab73d8U, 0x31625331U, 0x152a3f15U, + 0x04080c04U, 0xc79552c7U, 0x23466523U, 0xc39d5ec3U, + 0x18302818U, 0x9637a196U, 0x050a0f05U, 0x9a2fb59aU, + 0x070e0907U, 0x12243612U, 0x801b9b80U, 0xe2df3de2U, + 0xebcd26ebU, 0x274e6927U, 0xb27fcdb2U, 
0x75ea9f75U, + 0x09121b09U, 0x831d9e83U, 0x2c58742cU, 0x1a342e1aU, + 0x1b362d1bU, 0x6edcb26eU, 0x5ab4ee5aU, 0xa05bfba0U, + 0x52a4f652U, 0x3b764d3bU, 0xd6b761d6U, 0xb37dceb3U, + 0x29527b29U, 0xe3dd3ee3U, 0x2f5e712fU, 0x84139784U, + 0x53a6f553U, 0xd1b968d1U, 0x00000000U, 0xedc12cedU, + 0x20406020U, 0xfce31ffcU, 0xb179c8b1U, 0x5bb6ed5bU, + 0x6ad4be6aU, 0xcb8d46cbU, 0xbe67d9beU, 0x39724b39U, + 0x4a94de4aU, 0x4c98d44cU, 0x58b0e858U, 0xcf854acfU, + 0xd0bb6bd0U, 0xefc52aefU, 0xaa4fe5aaU, 0xfbed16fbU, + 0x4386c543U, 0x4d9ad74dU, 0x33665533U, 0x85119485U, + 0x458acf45U, 0xf9e910f9U, 0x02040602U, 0x7ffe817fU, + 0x50a0f050U, 0x3c78443cU, 0x9f25ba9fU, 0xa84be3a8U, + 0x51a2f351U, 0xa35dfea3U, 0x4080c040U, 0x8f058a8fU, + 0x923fad92U, 0x9d21bc9dU, 0x38704838U, 0xf5f104f5U, + 0xbc63dfbcU, 0xb677c1b6U, 0xdaaf75daU, 0x21426321U, + 0x10203010U, 0xffe51affU, 0xf3fd0ef3U, 0xd2bf6dd2U, + 0xcd814ccdU, 0x0c18140cU, 0x13263513U, 0xecc32fecU, + 0x5fbee15fU, 0x9735a297U, 0x4488cc44U, 0x172e3917U, + 0xc49357c4U, 0xa755f2a7U, 0x7efc827eU, 0x3d7a473dU, + 0x64c8ac64U, 0x5dbae75dU, 0x19322b19U, 0x73e69573U, + 0x60c0a060U, 0x81199881U, 0x4f9ed14fU, 0xdca37fdcU, + 0x22446622U, 0x2a547e2aU, 0x903bab90U, 0x880b8388U, + 0x468cca46U, 0xeec729eeU, 0xb86bd3b8U, 0x14283c14U, + 0xdea779deU, 0x5ebce25eU, 0x0b161d0bU, 0xdbad76dbU, + 0xe0db3be0U, 0x32645632U, 0x3a744e3aU, 0x0a141e0aU, + 0x4992db49U, 0x060c0a06U, 0x24486c24U, 0x5cb8e45cU, + 0xc29f5dc2U, 0xd3bd6ed3U, 0xac43efacU, 0x62c4a662U, + 0x9139a891U, 0x9531a495U, 0xe4d337e4U, 0x79f28b79U, + 0xe7d532e7U, 0xc88b43c8U, 0x376e5937U, 0x6ddab76dU, + 0x8d018c8dU, 0xd5b164d5U, 0x4e9cd24eU, 0xa949e0a9U, + 0x6cd8b46cU, 0x56acfa56U, 0xf4f307f4U, 0xeacf25eaU, + 0x65caaf65U, 0x7af48e7aU, 0xae47e9aeU, 0x08101808U, + 0xba6fd5baU, 0x78f08878U, 0x254a6f25U, 0x2e5c722eU, + 0x1c38241cU, 0xa657f1a6U, 0xb473c7b4U, 0xc69751c6U, + 0xe8cb23e8U, 0xdda17cddU, 0x74e89c74U, 0x1f3e211fU, + 0x4b96dd4bU, 0xbd61dcbdU, 0x8b0d868bU, 0x8a0f858aU, + 0x70e09070U, 0x3e7c423eU, 0xb571c4b5U, 
0x66ccaa66U, + 0x4890d848U, 0x03060503U, 0xf6f701f6U, 0x0e1c120eU, + 0x61c2a361U, 0x356a5f35U, 0x57aef957U, 0xb969d0b9U, + 0x86179186U, 0xc19958c1U, 0x1d3a271dU, 0x9e27b99eU, + 0xe1d938e1U, 0xf8eb13f8U, 0x982bb398U, 0x11223311U, + 0x69d2bb69U, 0xd9a970d9U, 0x8e07898eU, 0x9433a794U, + 0x9b2db69bU, 0x1e3c221eU, 0x87159287U, 0xe9c920e9U, + 0xce8749ceU, 0x55aaff55U, 0x28507828U, 0xdfa57adfU, + 0x8c038f8cU, 0xa159f8a1U, 0x89098089U, 0x0d1a170dU, + 0xbf65dabfU, 0xe6d731e6U, 0x4284c642U, 0x68d0b868U, + 0x4182c341U, 0x9929b099U, 0x2d5a772dU, 0x0f1e110fU, + 0xb07bcbb0U, 0x54a8fc54U, 0xbb6dd6bbU, 0x162c3a16U, + 0xc6a56363U, 0xf8847c7cU, 0xee997777U, 0xf68d7b7bU, + 0xff0df2f2U, 0xd6bd6b6bU, 0xdeb16f6fU, 0x9154c5c5U, + 0x60503030U, 0x02030101U, 0xcea96767U, 0x567d2b2bU, + 0xe719fefeU, 0xb562d7d7U, 0x4de6ababU, 0xec9a7676U, + 0x8f45cacaU, 0x1f9d8282U, 0x8940c9c9U, 0xfa877d7dU, + 0xef15fafaU, 0xb2eb5959U, 0x8ec94747U, 0xfb0bf0f0U, + 0x41ecadadU, 0xb367d4d4U, 0x5ffda2a2U, 0x45eaafafU, + 0x23bf9c9cU, 0x53f7a4a4U, 0xe4967272U, 0x9b5bc0c0U, + 0x75c2b7b7U, 0xe11cfdfdU, 0x3dae9393U, 0x4c6a2626U, + 0x6c5a3636U, 0x7e413f3fU, 0xf502f7f7U, 0x834fccccU, + 0x685c3434U, 0x51f4a5a5U, 0xd134e5e5U, 0xf908f1f1U, + 0xe2937171U, 0xab73d8d8U, 0x62533131U, 0x2a3f1515U, + 0x080c0404U, 0x9552c7c7U, 0x46652323U, 0x9d5ec3c3U, + 0x30281818U, 0x37a19696U, 0x0a0f0505U, 0x2fb59a9aU, + 0x0e090707U, 0x24361212U, 0x1b9b8080U, 0xdf3de2e2U, + 0xcd26ebebU, 0x4e692727U, 0x7fcdb2b2U, 0xea9f7575U, + 0x121b0909U, 0x1d9e8383U, 0x58742c2cU, 0x342e1a1aU, + 0x362d1b1bU, 0xdcb26e6eU, 0xb4ee5a5aU, 0x5bfba0a0U, + 0xa4f65252U, 0x764d3b3bU, 0xb761d6d6U, 0x7dceb3b3U, + 0x527b2929U, 0xdd3ee3e3U, 0x5e712f2fU, 0x13978484U, + 0xa6f55353U, 0xb968d1d1U, 0x00000000U, 0xc12cededU, + 0x40602020U, 0xe31ffcfcU, 0x79c8b1b1U, 0xb6ed5b5bU, + 0xd4be6a6aU, 0x8d46cbcbU, 0x67d9bebeU, 0x724b3939U, + 0x94de4a4aU, 0x98d44c4cU, 0xb0e85858U, 0x854acfcfU, + 0xbb6bd0d0U, 0xc52aefefU, 0x4fe5aaaaU, 0xed16fbfbU, + 0x86c54343U, 0x9ad74d4dU, 0x66553333U, 
0x11948585U, + 0x8acf4545U, 0xe910f9f9U, 0x04060202U, 0xfe817f7fU, + 0xa0f05050U, 0x78443c3cU, 0x25ba9f9fU, 0x4be3a8a8U, + 0xa2f35151U, 0x5dfea3a3U, 0x80c04040U, 0x058a8f8fU, + 0x3fad9292U, 0x21bc9d9dU, 0x70483838U, 0xf104f5f5U, + 0x63dfbcbcU, 0x77c1b6b6U, 0xaf75dadaU, 0x42632121U, + 0x20301010U, 0xe51affffU, 0xfd0ef3f3U, 0xbf6dd2d2U, + 0x814ccdcdU, 0x18140c0cU, 0x26351313U, 0xc32fececU, + 0xbee15f5fU, 0x35a29797U, 0x88cc4444U, 0x2e391717U, + 0x9357c4c4U, 0x55f2a7a7U, 0xfc827e7eU, 0x7a473d3dU, + 0xc8ac6464U, 0xbae75d5dU, 0x322b1919U, 0xe6957373U, + 0xc0a06060U, 0x19988181U, 0x9ed14f4fU, 0xa37fdcdcU, + 0x44662222U, 0x547e2a2aU, 0x3bab9090U, 0x0b838888U, + 0x8cca4646U, 0xc729eeeeU, 0x6bd3b8b8U, 0x283c1414U, + 0xa779dedeU, 0xbce25e5eU, 0x161d0b0bU, 0xad76dbdbU, + 0xdb3be0e0U, 0x64563232U, 0x744e3a3aU, 0x141e0a0aU, + 0x92db4949U, 0x0c0a0606U, 0x486c2424U, 0xb8e45c5cU, + 0x9f5dc2c2U, 0xbd6ed3d3U, 0x43efacacU, 0xc4a66262U, + 0x39a89191U, 0x31a49595U, 0xd337e4e4U, 0xf28b7979U, + 0xd532e7e7U, 0x8b43c8c8U, 0x6e593737U, 0xdab76d6dU, + 0x018c8d8dU, 0xb164d5d5U, 0x9cd24e4eU, 0x49e0a9a9U, + 0xd8b46c6cU, 0xacfa5656U, 0xf307f4f4U, 0xcf25eaeaU, + 0xcaaf6565U, 0xf48e7a7aU, 0x47e9aeaeU, 0x10180808U, + 0x6fd5babaU, 0xf0887878U, 0x4a6f2525U, 0x5c722e2eU, + 0x38241c1cU, 0x57f1a6a6U, 0x73c7b4b4U, 0x9751c6c6U, + 0xcb23e8e8U, 0xa17cddddU, 0xe89c7474U, 0x3e211f1fU, + 0x96dd4b4bU, 0x61dcbdbdU, 0x0d868b8bU, 0x0f858a8aU, + 0xe0907070U, 0x7c423e3eU, 0x71c4b5b5U, 0xccaa6666U, + 0x90d84848U, 0x06050303U, 0xf701f6f6U, 0x1c120e0eU, + 0xc2a36161U, 0x6a5f3535U, 0xaef95757U, 0x69d0b9b9U, + 0x17918686U, 0x9958c1c1U, 0x3a271d1dU, 0x27b99e9eU, + 0xd938e1e1U, 0xeb13f8f8U, 0x2bb39898U, 0x22331111U, + 0xd2bb6969U, 0xa970d9d9U, 0x07898e8eU, 0x33a79494U, + 0x2db69b9bU, 0x3c221e1eU, 0x15928787U, 0xc920e9e9U, + 0x8749ceceU, 0xaaff5555U, 0x50782828U, 0xa57adfdfU, + 0x038f8c8cU, 0x59f8a1a1U, 0x09808989U, 0x1a170d0dU, + 0x65dabfbfU, 0xd731e6e6U, 0x84c64242U, 0xd0b86868U, + 0x82c34141U, 0x29b09999U, 0x5a772d2dU, 
0x1e110f0fU, + 0x7bcbb0b0U, 0xa8fc5454U, 0x6dd6bbbbU, 0x2c3a1616U +}; + +#define t_fn0(x) (sharedMemory[ (x)]) +#define t_fn1(x) (sharedMemory[256 + (x)]) +#define t_fn2(x) (sharedMemory[512 + (x)]) +#define t_fn3(x) (sharedMemory[768 + (x)]) + +__device__ __forceinline__ static void cn_aes_gpu_init(uint32_t *sharedMemory) +{ + for(int i = threadIdx.x; i < 1024; i += blockDim.x) + sharedMemory[i] = d_t_fn[i]; +} + +)===" +R"===( + +template< typename T > +__forceinline__ __device__ void unusedVar( const T& ) +{ +} + +template +__forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr,const uint32_t sub,const int val,const uint32_t src) +{ +# if ( __CUDA_ARCH__ < 300 ) + ptr[sub] = val; + return ptr[src & (group_n-1)]; +# else + unusedVar( ptr ); + unusedVar( sub ); +# if (__CUDACC_VER_MAJOR__ >= 9) + return __shfl_sync(__activemask(), val, src, group_n); +# else + return __shfl( val, src, group_n ); +# endif +# endif +} + + +template +__forceinline__ __device__ uint64_t shuffle64(volatile uint32_t* ptr,const uint32_t sub,const uint64_t val,const uint32_t src, const uint32_t src2) +{ + uint64_t tmp; + ((uint32_t*)&tmp)[0] = shuffle(ptr, sub, static_cast(val), src); + ((uint32_t*)&tmp)[1] = shuffle(ptr, sub, static_cast(val >> 32), src2); + return tmp; +} + +struct u64 : public uint2 +{ + + __forceinline__ __device__ u64(){} + + __forceinline__ __device__ u64( const uint32_t x0, const uint32_t x1) + { + uint2::x = x0; + uint2::y = x1; + } + + __forceinline__ __device__ operator uint64_t() const + { + return *((uint64_t*)this); + } + + __forceinline__ __device__ u64( const uint64_t x0) + { + ((uint64_t*)&this->x)[0] = x0; + } + + __forceinline__ __device__ u64 operator^=(const u64& other) + { + uint2::x ^= other.x; + uint2::y ^= other.y; + + return *this; + } + + __forceinline__ __device__ u64 operator^=(const uint64_t& other) + { + uint2::x ^= static_cast(other); + uint2::y ^= static_cast(other >> 32); + + return *this; + } + + __forceinline__ __device__ u64 
operator+(const u64& other) const + { + u64 tmp; + ((uint64_t*)&tmp.x)[0] = ((uint64_t*)&(this->x))[0] + ((uint64_t*)&(other.x))[0]; + + return tmp; + } + + __forceinline__ __device__ u64 operator+=(const uint64_t& other) + { + return ((uint64_t*)&this->x)[0] += other; + } +}; + +#ifdef RANDOM_MATH_64_BIT + +__device__ __forceinline__ static uint64_t rotate_left(uint64_t a, uint64_t b) +{ + const int shift = b & 63; + return (a << shift) | (a >> (64 - shift)); +} + +__device__ __forceinline__ static uint64_t rotate_right(uint64_t a, uint64_t b) +{ + const int shift = b & 63; + return (a >> shift) | (a << (64 - shift)); +} + +#else + +__device__ __forceinline__ static uint32_t rotate_left(uint32_t a, uint32_t b) { +#if __CUDA_ARCH__ < 350 + const uint32_t shift = b & 31; + return (a << shift) | (a >> (32 - shift)); +#else + return __funnelshift_l(a, a, b); +#endif +} +__device__ __forceinline__ static uint32_t rotate_right(uint32_t a, uint32_t b) { +#if __CUDA_ARCH__ < 350 + const uint32_t shift = b & 31; + return (a >> shift) | (a << (32 - shift)); +#else + return __funnelshift_r(a, a, b); +#endif +} + +#endif + +__global__ void CryptonightR_phase2( + const uint32_t ITERATIONS, + const size_t MEMORY, + const uint32_t MASK, + int threads, + int bfactor, + int partidx, + uint32_t *d_long_state, + uint32_t *d_ctx_a, + uint32_t *d_ctx_b, + uint32_t * d_ctx_state, + uint32_t startNonce, + uint32_t * __restrict__ d_input + ) +{ + __shared__ uint32_t sharedMemory[1024]; + + cn_aes_gpu_init( sharedMemory ); + +# if( __CUDA_ARCH__ < 300 ) + extern __shared__ uint64_t externShared[]; + // 8 x 64bit values + volatile uint64_t* myChunks = (volatile uint64_t*)(externShared + (threadIdx.x >> 1) * 8); + volatile uint32_t* sPtr = (volatile uint32_t*)(externShared + (blockDim.x >> 1) * 8) + (threadIdx.x & 0xFFFFFFFE); +# else + extern __shared__ uint64_t chunkMem[]; + volatile uint32_t* sPtr = NULL; + // 8 x 64bit values + volatile uint64_t* myChunks = (volatile uint64_t*)(chunkMem 
+ (threadIdx.x >> 1) * 8); +# endif + + __syncthreads( ); + + const uint64_t tid = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = tid >> 1; + const uint32_t sub = tid & 1; + + if (thread >= threads) { + return; + } + + uint8_t *l0 = (uint8_t*)&d_long_state[((uint64_t)thread) * MEMORY]; + uint64_t ax0 = ((uint64_t*)(d_ctx_a + thread * 4))[sub]; + uint32_t idx0 = shuffle<2>(sPtr, sub, static_cast(ax0), 0); + uint64_t bx0 = ((uint64_t*)(d_ctx_b + thread * 16))[sub]; + uint64_t bx1 = ((uint64_t*)(d_ctx_b + thread * 16 + 4))[sub]; + + uint32_t r0 = d_ctx_b[thread * 16 + 4 * 2]; + uint32_t r1 = d_ctx_b[thread * 16 + 4 * 2 + 1]; + uint32_t r2 = d_ctx_b[thread * 16 + 4 * 2 + 2]; + uint32_t r3 = d_ctx_b[thread * 16 + 4 * 2 + 3]; + + const int batchsize = (ITERATIONS * 2) >> ( 1 + bfactor ); + const int start = partidx * batchsize; + const int end = start + batchsize; + + uint64_t* ptr0; + for (int i = start; i < end; ++i) { + ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0]; + + ((ulonglong4*)myChunks)[sub] = ((ulonglong4*)ptr0)[sub]; + + uint32_t idx1 = (idx0 & 0x30) >> 3; + const u64 cx = myChunks[ idx1 + sub ]; + const u64 cx2 = myChunks[ idx1 + ((sub + 1) & 1) ]; + + u64 cx_aes = ax0 ^ u64( + t_fn0( cx.x & 0xff ) ^ t_fn1( (cx.y >> 8) & 0xff ) ^ t_fn2( (cx2.x >> 16) & 0xff ) ^ t_fn3( (cx2.y >> 24 ) ), + t_fn0( cx.y & 0xff ) ^ t_fn1( (cx2.x >> 8) & 0xff ) ^ t_fn2( (cx2.y >> 16) & 0xff ) ^ t_fn3( (cx.x >> 24 ) ) + ); + + { + const uint64_t chunk1 = myChunks[idx1 ^ 2 + sub]; + const uint64_t chunk2 = myChunks[idx1 ^ 4 + sub]; + const uint64_t chunk3 = myChunks[idx1 ^ 6 + sub]; + +#if(ALGO == cryptonight_r) + cx_aes ^= chunk1 ^ chunk2 ^ chunk3; +#endif + +#if (__CUDACC_VER_MAJOR__ >= 9) + __syncwarp(); +#else + __syncthreads(); +#endif + + myChunks[idx1 ^ 2 + sub] = chunk3 + bx1; + myChunks[idx1 ^ 4 + sub] = chunk1 + bx0; + myChunks[idx1 ^ 6 + sub] = chunk2 + ax0; + } + + myChunks[idx1 + sub] = cx_aes ^ bx0; + + ((ulonglong4*)ptr0)[sub] = 
((ulonglong4*)myChunks)[sub]; + + idx0 = shuffle<2>(sPtr, sub, cx_aes.x, 0); + idx1 = (idx0 & 0x30) >> 3; + ptr0 = (uint64_t *)&l0[idx0 & MASK & 0x1FFFC0]; + + ((ulonglong4*)myChunks)[sub] = ((ulonglong4*)ptr0)[sub]; + + uint64_t cx_mul; + ((uint32_t*)&cx_mul)[0] = shuffle<2>(sPtr, sub, cx_aes.x , 0); + ((uint32_t*)&cx_mul)[1] = shuffle<2>(sPtr, sub, cx_aes.y , 0); + + const uint32_t r4 = shuffle<2>(sPtr, sub, static_cast(ax0), 0); + const uint32_t r6 = shuffle<2>(sPtr, sub, static_cast(bx0), 0); + const uint32_t r7 = shuffle<2>(sPtr, sub, static_cast(bx1), 0); + + const uint64_t ax0_saved = ax0; + + if (sub == 1) + { + ((uint32_t*)&myChunks[idx1])[0] ^= r0 + r1; + ((uint32_t*)&myChunks[idx1])[1] ^= r2 + r3; + + const uint32_t r5 = static_cast(ax0); +#if(ALGO == cryptonight_r) + const uint32_t r8 = static_cast(bx1); +#endif + + XMRSTAK_INCLUDE_RANDOM_MATH + } + +#if(ALGO == cryptonight_r) + r0 = shuffle<2>(sPtr, sub, r0, 1); + r1 = shuffle<2>(sPtr, sub, r1, 1); + r2 = shuffle<2>(sPtr, sub, r2, 1); + r3 = shuffle<2>(sPtr, sub, r3, 1); + ax0 ^= (sub == 0) ? (r2 | ((uint64_t)(r3) << 32)) : (r0 | ((uint64_t)(r1) << 32)); +#endif + +#if (__CUDACC_VER_MAJOR__ >= 9) + __syncwarp(); +#else + __syncthreads( ); +#endif + + uint64_t c = ((uint64_t*)myChunks)[idx1 + sub]; + + { + uint64_t cl = ((uint64_t*)myChunks)[idx1]; + // sub 0 -> hi, sub 1 -> lo + uint64_t res = sub == 0 ? 
__umul64hi( cx_mul, cl ) : cx_mul * cl; + + const uint64_t chunk1 = myChunks[ idx1 ^ 2 + sub ] +#if(ALGO == cryptonight_r_wow) + ^ res +#endif + ; + uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ]; +#if(ALGO == cryptonight_r_wow) + res ^= ((uint64_t*)&chunk2)[0]; +#endif + const uint64_t chunk3 = myChunks[ idx1 ^ 6 + sub ]; + +#if(ALGO == cryptonight_r) + cx_aes ^= chunk1 ^ chunk2 ^ chunk3; +#endif + +# if (__CUDACC_VER_MAJOR__ >= 9) + __syncwarp(); +# else + __syncthreads( ); +# endif + + myChunks[idx1 ^ 2 + sub] = chunk3 + bx1; + myChunks[idx1 ^ 4 + sub] = chunk1 + bx0; + myChunks[idx1 ^ 6 + sub] = chunk2 + ax0_saved; + + ax0 += res; + } + + bx1 = bx0; + bx0 = cx_aes; + + myChunks[idx1 + sub] = ax0; + + ((ulonglong4*)ptr0)[sub] = ((ulonglong4*)myChunks)[sub]; + + ax0 ^= c; + idx0 = shuffle<2>(sPtr, sub, static_cast(ax0), 0); + } + + if (bfactor > 0) + { + ((uint64_t*)(d_ctx_a + thread * 4))[sub] = ax0; + ((uint64_t*)(d_ctx_b + thread * 16))[sub] = bx0; + ((uint64_t*)(d_ctx_b + thread * 16 + 4))[sub] = bx1; + + if (sub == 1) + { + // must be valid only for `sub == 1` + d_ctx_b[thread * 16 + 4 * 2] = r0; + d_ctx_b[thread * 16 + 4 * 2 + 1] = r1; + d_ctx_b[thread * 16 + 4 * 2 + 2] = r2; + d_ctx_b[thread * 16 + 4 * 2 + 3] = r3; + } + } +} +)===" diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp index 563bb3b9e..96cb679f5 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_device.hpp @@ -22,6 +22,18 @@ } \ ( (void) 0 ) +#define CU_CHECK(id, ...) { \ + CUresult result = __VA_ARGS__; \ + if(result != CUDA_SUCCESS){ \ + const char* s; \ + cuGetErrorString(result, &s); \ + std::cerr << "[CUDA] Error gpu " << id << ": <" << __FUNCTION__ << ">:" << __LINE__ << " \"" << (s ? s : "unknown error") << "\"" << std::endl; \ + throw std::runtime_error(std::string("[CUDA] Error: ") + std::string(s ? 
s : "unknown error")); \ + } \ +} \ +( (void) 0 ) + + /** execute and check a CUDA api command * * @param id gpu id (thread id) diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu index 45afec9ac..7a9ccddc2 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu @@ -9,21 +9,6 @@ #include #include "xmrstak/jconf.hpp" -#ifdef __CUDACC__ -__constant__ -#else -const -#endif -uint64_t keccakf_rndc[24] ={ - 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, - 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, - 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, - 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, - 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, - 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, - 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, - 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 -}; typedef unsigned char BitSequence; typedef unsigned long long DataLength; @@ -108,7 +93,7 @@ __device__ __forceinline__ void mix_and_propagate( uint32_t* state ) (state + 4 * 7)[x] = (state + 4 * 7)[x] ^ tmp0[x]; } -template +template __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restrict__ d_input, uint32_t len, uint32_t startNonce, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_state2, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2 ) { int thread = ( blockDim.x * blockIdx.x + threadIdx.x ); @@ -144,14 +129,23 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric memcpy( d_ctx_a + thread * 4, ctx_a, 4 * 4 ); if(ALGO == cryptonight_monero_v8) { - memcpy( d_ctx_b + thread * 12, ctx_b, 4 * 4 ); + memcpy( d_ctx_b + thread * 16, ctx_b, 4 * 4 ); // bx1 XOR_BLOCKS_DST( ctx_state + 16, ctx_state + 20, ctx_b ); - 
memcpy( d_ctx_b + thread * 12 + 4, ctx_b, 4 * 4 ); + memcpy( d_ctx_b + thread * 16 + 4, ctx_b, 4 * 4 ); // division_result - memcpy( d_ctx_b + thread * 12 + 2 * 4, ctx_state + 24, 4 * 2 ); + memcpy( d_ctx_b + thread * 16 + 2 * 4, ctx_state + 24, 4 * 2 ); // sqrt_result - memcpy( d_ctx_b + thread * 12 + 2 * 4 + 2, ctx_state + 26, 4 * 2 ); + memcpy( d_ctx_b + thread * 16 + 2 * 4 + 2, ctx_state + 26, 4 * 2 ); + } + else if(ALGO == cryptonight_r_wow || ALGO == cryptonight_r) + { + memcpy(d_ctx_b + thread * 16, ctx_b, 4 * 4); + // bx1 + XOR_BLOCKS_DST(ctx_state + 16, ctx_state + 20, ctx_b); + memcpy(d_ctx_b + thread * 16 + 4, ctx_b, 4 * 4); + // r0, r1, r2, r3 + memcpy(d_ctx_b + thread * 16 + 2 * 4, ctx_state + 24, 4 * 8); } else memcpy( d_ctx_b + thread * 4, ctx_b, 4 * 4 ); @@ -177,14 +171,15 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric } } -template +template __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint32_t* __restrict__ d_res_count, uint32_t * __restrict__ d_res_nonce, uint32_t * __restrict__ d_ctx_state,uint32_t * __restrict__ d_ctx_key2 ) { const int thread = blockDim.x * blockIdx.x + threadIdx.x; __shared__ uint32_t sharedMemory[1024]; - if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || + ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) { cn_aes_gpu_init( sharedMemory ); __syncthreads( ); @@ -201,7 +196,8 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3 for ( i = 0; i < 50; i++ ) state[i] = ctx_state[i]; - if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) + if(ALGO == cryptonight_gpu || ALGO == cryptonight_heavy || ALGO == cryptonight_haven || + ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast) { 
uint32_t key[40]; @@ -220,33 +216,46 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3 } cn_keccakf2( (uint64_t *) state ); - switch ( ( (uint8_t *) state )[0] & 0x03 ) + if(ALGO == cryptonight_gpu) { - case 0: - cn_blake( (const uint8_t *) state, 200, (uint8_t *) hash ); - break; - case 1: - cn_groestl( (const BitSequence *) state, 200, (BitSequence *) hash ); - break; - case 2: - cn_jh( (const BitSequence *) state, 200, (BitSequence *) hash ); - break; - case 3: - cn_skein( (const BitSequence *) state, 200, (BitSequence *) hash ); - break; - default: - break; + if ( ((uint64_t*)state)[3] < target ) + { + uint32_t idx = atomicInc( d_res_count, 0xFFFFFFFF ); + + if(idx < 10) + d_res_nonce[idx] = thread; + } } + else + { + switch ( ( (uint8_t *) state )[0] & 0x03 ) + { + case 0: + cn_blake( (const uint8_t *) state, 200, (uint8_t *) hash ); + break; + case 1: + cn_groestl( (const BitSequence *) state, 200, (BitSequence *) hash ); + break; + case 2: + cn_jh( (const BitSequence *) state, 200, (BitSequence *) hash ); + break; + case 3: + cn_skein( (const BitSequence *) state, 200, (BitSequence *) hash ); + break; + default: + break; + } - // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values - // and expect an accurate result for target > 32-bit without implementing carries + // Note that comparison is equivalent to subtraction - we can't just compare 8 32-bit values + // and expect an accurate result for target > 32-bit without implementing carries - if ( hash[3] < target ) - { - uint32_t idx = atomicInc( d_res_count, 0xFFFFFFFF ); + if ( hash[3] < target ) + { + uint32_t idx = atomicInc( d_res_count, 0xFFFFFFFF ); - if(idx < 10) - d_res_nonce[idx] = thread; + if(idx < 10) + d_res_nonce[idx] = thread; + } } } @@ -258,6 +267,9 @@ extern "C" void cryptonight_extra_cpu_set_data( nvid_ctx* ctx, const void *data, extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) { + CU_CHECK(ctx->device_id, 
cuDeviceGet(&ctx->cuDevice, ctx->device_id)); + CU_CHECK(ctx->device_id, cuCtxCreate(&ctx->cuContext, 0, ctx->cuDevice)); + cudaError_t err; err = cudaSetDevice(ctx->device_id); if(err != cudaSuccess) @@ -287,19 +299,22 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) // prefer shared memory over L1 cache CUDA_CHECK(ctx->device_id, cudaDeviceSetCacheConfig(cudaFuncCachePreferShared)); - size_t hashMemSize = std::max( - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()), - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) - ); + auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); + + size_t hashMemSize = 0; + for(const auto algo : neededAlgorithms) + { + hashMemSize = std::max(hashMemSize, algo.Mem()); + } size_t wsize = ctx->device_blocks * ctx->device_threads; CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state, 50 * sizeof(uint32_t) * wsize)); size_t ctx_b_size = 4 * sizeof(uint32_t) * wsize; if( - cryptonight_heavy == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() || - cryptonight_haven == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() || - cryptonight_bittube2 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() || - cryptonight_superfast == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end() || + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_haven) != neededAlgorithms.end() || + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_bittube2) != neededAlgorithms.end() || + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_superfast) != neededAlgorithms.end() ) { // extent ctx_b to hold the state of idx0 @@ -307,11 +322,22 @@ extern "C" int 
cryptonight_extra_cpu_init(nvid_ctx* ctx) // create a double buffer for the state to exchange the mixed state to phase1 CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_state2, 50 * sizeof(uint32_t) * wsize)); } - else if(cryptonight_monero_v8 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() || - cryptonight_monero_v8 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) + else if(std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_conceal) != neededAlgorithms.end()) + { + ctx_b_size += sizeof(uint32_t) * 4 * wsize; + } + else if(std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end()) { - // bx1 (16byte), division_result (8byte) and sqrt_result (8byte) - ctx_b_size = 3 * 4 * sizeof(uint32_t) * wsize; + // bx0 (16byte), bx1 (16byte), division_result (8byte) and sqrt_result (8byte), padding (16byte) + ctx_b_size = 4 * 4 * sizeof(uint32_t) * wsize; + } + else if( + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_r) != neededAlgorithms.end() || + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_r_wow) != neededAlgorithms.end() + ) + { + // bx0 (16byte), bx1 (16byte), and [r0, r1, r2, r3] (a 8byte) + ctx_b_size = 4 * 4 * sizeof(uint32_t) * wsize; } else ctx->d_ctx_state2 = ctx->d_ctx_state; @@ -332,7 +358,7 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) return 1; } -extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce, xmrstak_algo miner_algo) +extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce, const xmrstak_algo& miner_algo) { int threadsperblock = 128; uint32_t wsize = ctx->device_blocks * ctx->device_threads; @@ -360,11 +386,26 @@ extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, 
startNonce, ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); } - if(miner_algo == cryptonight_monero_v8) + else if(miner_algo == cryptonight_monero_v8) { CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); } + else if(miner_algo == cryptonight_gpu) + { + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + } + else if(miner_algo == cryptonight_r) + { + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + } + else if(miner_algo == cryptonight_r_wow) + { + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + } else { /* pass two times d_ctx_state because the second state is used later in phase1, @@ -375,7 +416,7 @@ extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce } } -extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce,xmrstak_algo miner_algo) +extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, uint64_t target, uint32_t* rescount, uint32_t *resnonce, const xmrstak_algo& miner_algo) { int threadsperblock = 128; uint32_t wsize = ctx->device_blocks * ctx->device_threads; @@ -409,7 +450,7 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, "\n**suggestion: Try to increase the value of 
the attribute 'bfactor' in the NVIDIA config file.**", cryptonight_extra_gpu_final<<>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 ) ); - } + } else if(miner_algo == cryptonight_bittube2) { CUDA_CHECK_MSG_KERNEL( @@ -418,6 +459,15 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce, cryptonight_extra_gpu_final<<>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 ) ); } + else if(miner_algo == cryptonight_gpu) + { + // fallback for all other algorithms + CUDA_CHECK_MSG_KERNEL( + ctx->device_id, + "\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**", + cryptonight_extra_gpu_final<<>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 ) + ); + } else { // fallback for all other algorithms @@ -571,6 +621,10 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) } } + auto neededAlgorithms = ::jconf::inst()->GetCurrentCoinSelection().GetAllAlgorithms(); + bool useCryptonight_gpu = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_gpu) != neededAlgorithms.end(); + + // set all device option those marked as auto (-1) to a valid value if(ctx->device_blocks == -1) { @@ -578,8 +632,11 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) * - 3 * SMX count for >=sm_30 * - 2 * SMX count for device_blocks = props.multiProcessorCount * - ( props.major < 3 ? 2 : 3 ); + ctx->device_blocks = props.multiProcessorCount * (props.major < 3 ? 2 : 3); + + // use 6 blocks per SM for sm_2X else 8 blocks + if(useCryptonight_gpu) + ctx->device_blocks = props.multiProcessorCount * (props.major < 3 ? 
6 : 8); // increase bfactor for low end devices to avoid that the miner is killed by the OS if(props.multiProcessorCount <= 6) @@ -591,7 +648,16 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) * `cryptonight_core_gpu_phase1` and `cryptonight_core_gpu_phase3` starts * `8 * ctx->device_threads` threads per block */ - ctx->device_threads = 64; + const uint32_t maxThreadsPerBlock = props.major < 3 ? 512 : 1024; + + // for the most algorithms we are using 8 threads per hash + uint32_t threadsPerHash = 8; + + // phase2_gpu uses 16 threads per hash + if(useCryptonight_gpu) + threadsPerHash = 16; + + ctx->device_threads = maxThreadsPerBlock / threadsPerHash; constexpr size_t byteToMiB = 1024u * 1024u; // no limit by default 1TiB @@ -656,10 +722,11 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) ctx->total_device_memory = totalMemory; ctx->free_device_memory = freeMemory; - size_t hashMemSize = std::max( - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()), - cn_select_memory(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot()) - ); + size_t hashMemSize = 0; + for(const auto algo : neededAlgorithms) + { + hashMemSize = std::max(hashMemSize, algo.Mem()); + } #ifdef WIN32 /* We use in windows bfactor (split slow kernel into smaller parts) to avoid @@ -688,10 +755,10 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) // 680bytes are extra meta data memory per hash size_t perThread = hashMemSize + 16192u + 680u; if( - cryptonight_heavy == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() || - cryptonight_haven == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() || - cryptonight_bittube2 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() || - cryptonight_superfast == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), 
cryptonight_heavy) != neededAlgorithms.end() || + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_haven) != neededAlgorithms.end() || + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_bittube2) != neededAlgorithms.end() || + std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_superfast) != neededAlgorithms.end() ) perThread += 50 * 4; // state double buffer @@ -700,19 +767,18 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) // use only odd number of threads ctx->device_threads = ctx->device_threads & 0xFFFFFFFE; - if(props.major == 2 && ctx->device_threads > 64) + if(ctx->device_threads > maxThreadsPerBlock / threadsPerHash) { - // Fermi gpus only support 512 threads per block (we need start 4 * configured threads) - ctx->device_threads = 64; + ctx->device_threads = maxThreadsPerBlock / threadsPerHash; } // check if cryptonight_monero_v8 is selected for the user pool - bool useCryptonight_v8 = - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_monero_v8 || - ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_monero_v8; + bool useCryptonight_v8 = (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end()); + bool useCryptonight_r = (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_r) != neededAlgorithms.end()); + bool useCryptonight_r_wow = (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_r_wow) != neededAlgorithms.end()); // overwrite default config if cryptonight_monero_v8 is mined and GPU has at least compute capability 5.0 - if(useCryptonight_v8 && gpuArch >= 50) + if((useCryptonight_v8 || useCryptonight_r || useCryptonight_r_wow) && gpuArch >= 50) { // 4 based on my test maybe it must be adjusted later size_t threads = 4; @@ -725,6 +791,28 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) 
ctx->device_blocks = blockOptimal; } } + else if(useCryptonight_gpu) + { + // 8 based on my profiling sessions maybe it must be adjusted later + size_t threads = 8; + // 8 is chosen by checking the occupancy calculator + size_t blockOptimal = 8 * ctx->device_mpcount; + + // the following values are calculated with CUDA10 and the occupancy calculator + if(gpuArch == 35 || gpuArch/10 == 5 || gpuArch/10 == 6) + blockOptimal = 7 * ctx->device_mpcount; + if(gpuArch == 37) + blockOptimal = 14 * ctx->device_mpcount; + if(gpuArch >= 70) + blockOptimal = 6 * ctx->device_mpcount; + + if(blockOptimal * threads * hashMemSize < limitedMemory) + { + ctx->device_threads = threads; + ctx->device_blocks = blockOptimal; + } + + } } printf("device init succeeded\n"); diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp index 99c651645..c75c74964 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp @@ -1,3 +1,23 @@ +#pragma once + +#include "cuda_extra.hpp" + +#ifdef __CUDACC__ +__constant__ +#else +const +#endif +uint64_t keccakf_rndc[24] ={ + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008 +}; + #if __CUDA_ARCH__ >= 350 __forceinline__ __device__ uint64_t cuda_rotl64(const uint64_t value, const int offset) { diff --git a/xmrstak/cli/cli-miner.cpp b/xmrstak/cli/cli-miner.cpp index c57416f2e..5a8a51703 100644 --- a/xmrstak/cli/cli-miner.cpp +++ b/xmrstak/cli/cli-miner.cpp @@ -811,7 +811,7 @@ int main(int argc, char *argv[]) 
printer::inst()->print_str("This currency is a way for us to implement the ideas that we were unable to in\n"); printer::inst()->print_str("Monero. See https://github.com/fireice-uk/cryptonote-speedup-demo for details.\n"); printer::inst()->print_str("-------------------------------------------------------------------\n"); - printer::inst()->print_msg(L0, "Mining coin: %s", jconf::inst()->GetMiningCoin().c_str()); + printer::inst()->print_msg(L0, "Mining coin: %s", ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo().Name().c_str()); if(params::inst().benchmark_block_version >= 0) { @@ -875,13 +875,12 @@ int do_benchmark(int block_version, int wait_sec, int work_sec) /* AMD and NVIDIA is currently only supporting work sizes up to 84byte * \todo fix this issue */ - xmrstak::miner_work benchWork = xmrstak::miner_work("", work, 84, 0, false, 0); printer::inst()->print_msg(L0, "Start a %d second benchmark...",work_sec); - xmrstak::globalStates::inst().switch_work(benchWork, dat); + xmrstak::globalStates::inst().switch_work(xmrstak::miner_work("", work, 84, 0, false, 0, 0), dat); uint64_t iStartStamp = get_timestamp_ms(); std::this_thread::sleep_for(std::chrono::seconds(work_sec)); - xmrstak::globalStates::inst().switch_work(oWork, dat); + xmrstak::globalStates::inst().switch_work(xmrstak::miner_work("", work, 84, 0, false, 0, 0), dat); double fTotalHps = 0.0; for (uint32_t i = 0; i < pvThreads->size(); i++) diff --git a/xmrstak/config.tpl b/xmrstak/config.tpl index 73ae054c2..d8fd861a7 100644 --- a/xmrstak/config.tpl +++ b/xmrstak/config.tpl @@ -25,15 +25,16 @@ R"===(// generated by XMRSTAK_VERSION * performance monitors, there is very little reason to spew out pages of text instead of concise reports. * Press 'h' (hashrate), 'r' (results) or 'c' (connection) to print reports. * - * verbose_level - 0 - Don't print anything. 
- * 1 - Print intro, connection event, disconnect event - * 2 - All of level 1, and new job (block) event if the difficulty is different from the last job - * 3 - All of level 1, and new job (block) event in all cases, result submission event. - * 4 - All of level 3, and automatic hashrate report printing + * verbose_level - 0 - Don't print anything. + * 1 - Print intro, connection event, disconnect event + * 2 - All of level 1, and new job (block) event if the difficulty is different from the last job + * 3 - All of level 1, and new job (block) event in all cases, result submission event. + * 4 - All of level 3, and automatic hashrate report printing + * 10 - Debug level for developer * * print_motd - Display messages from your pool operator in the hashrate result. */ -"verbose_level" : 3, +"verbose_level" : 4, "print_motd" : true, /* @@ -42,7 +43,7 @@ R"===(// generated by XMRSTAK_VERSION * h_print_time - How often, in seconds, should we print a hashrate report if verbose_level is set to 4. * This option has no effect if verbose_level is not 4. */ -"h_print_time" : 60, +"h_print_time" : 300, /* * Manual hardware AES override diff --git a/xmrstak/http/webdesign.cpp b/xmrstak/http/webdesign.cpp index 93e217519..8f20078aa 100644 --- a/xmrstak/http/webdesign.cpp +++ b/xmrstak/http/webdesign.cpp @@ -157,7 +157,7 @@ extern const char sHtmlHashrateBodyHigh [] = "Thread ID10s60s15mH/s"; extern const char sHtmlHashrateTableRow [] = - "%u%s%s%s"; + "%s%s%s%s"; extern const char sHtmlHashrateBodyLow [] = "Totals:%s%s%s" @@ -168,6 +168,7 @@ extern const char sHtmlHashrateBodyLow [] = extern const char sHtmlConnectionBodyHigh [] = "
" "" + "" "" "" "" @@ -185,6 +186,7 @@ extern const char sHtmlConnectionBodyLow [] = extern const char sHtmlResultBodyHigh [] = "
" "
Rig ID%s
Pool address%s
Connected since%s
Pool ping time%u ms
" + "" "" "" "" diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp index 2a2dc8dbc..e60420234 100644 --- a/xmrstak/jconf.cpp +++ b/xmrstak/jconf.cpp @@ -87,31 +87,39 @@ constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0])); xmrstak::coin_selection coins[] = { // name, userpool, devpool, default_pool_suggestion - { "aeon7", {cryptonight_aeon, cryptonight_aeon, 0u}, {cryptonight_aeon, cryptonight_aeon, 0u}, "mine.aeon-pool.com:5555" }, - { "bbscoin", {cryptonight_aeon, cryptonight_aeon, 0u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr }, - { "bittube", {cryptonight_heavy, cryptonight_bittube2, 255u}, {cryptonight_heavy, cryptonight_heavy, 0u},"mining.bit.tube:13333"}, - { "cryptonight", {cryptonight_monero_v8, cryptonight, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, - { "cryptonight_bittube2",{cryptonight_heavy, cryptonight_bittube2, 255u}, {cryptonight_heavy, cryptonight_heavy, 0u},nullptr}, - { "cryptonight_masari", {cryptonight_monero_v8, cryptonight_masari, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u},nullptr }, - { "cryptonight_haven", {cryptonight_heavy, cryptonight_haven, 255u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, - { "cryptonight_heavy", {cryptonight_heavy, cryptonight_heavy, 0u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, - { "cryptonight_lite", {cryptonight_aeon, cryptonight_lite, 255u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr }, - { "cryptonight_lite_v7", {cryptonight_aeon, cryptonight_aeon, 0u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr }, - { "cryptonight_lite_v7_xor", {cryptonight_aeon, cryptonight_ipbc, 255u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr }, - { "cryptonight_superfast", {cryptonight_heavy, cryptonight_superfast, 255u},{cryptonight_heavy, cryptonight_superfast, 0u}, nullptr }, - { "cryptonight_v7", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, - { 
"cryptonight_v8", {cryptonight_monero_v8, cryptonight_monero_v8, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, - { "cryptonight_v7_stellite", {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, - { "freehaven", {cryptonight_heavy, cryptonight_superfast, 255u}, {cryptonight_heavy, cryptonight_superfast, 0u}, nullptr }, - { "graft", {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, - { "haven", {cryptonight_heavy, cryptonight_haven, 255u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, - { "intense", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, - { "masari", {cryptonight_monero_v8, cryptonight_masari, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u},nullptr }, - { "monero", {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, "pool.usxmrpool.com:3333" }, - { "qrl", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, - { "ryo", {cryptonight_heavy, cryptonight_heavy, 0u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr }, - { "stellite", {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr }, - { "turtlecoin", {cryptonight_aeon, cryptonight_aeon, 0u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr } + { "aeon7", {POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, "mine.aeon-pool.com:5555" }, + { "bbscoin", {POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr }, + { "bittube", {POW(cryptonight_bittube2)}, {POW(cryptonight_gpu)}, "mining.bit.tube:13333" }, + { "cryptonight", {POW(cryptonight)}, {POW(cryptonight_gpu)}, nullptr }, + { "cryptonight_bittube2", {POW(cryptonight_bittube2)}, {POW(cryptonight_gpu)}, nullptr }, + { "cryptonight_masari", 
{POW(cryptonight_masari)}, {POW(cryptonight_gpu)}, nullptr }, + { "cryptonight_haven", {POW(cryptonight_haven)}, {POW(cryptonight_gpu)}, nullptr }, + { "cryptonight_heavy", {POW(cryptonight_heavy)}, {POW(cryptonight_gpu)}, nullptr }, + { "cryptonight_lite", {POW(cryptonight_lite)}, {POW(cryptonight_aeon)}, nullptr }, + { "cryptonight_lite_v7", {POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr }, + { "cryptonight_lite_v7_xor", {POW(cryptonight_ipbc)}, {POW(cryptonight_aeon)}, nullptr }, + { "cryptonight_r", {POW(cryptonight_r)}, {POW(cryptonight_r),10,POW(cryptonight_monero_v8)}, nullptr }, + { "cryptonight_superfast", {POW(cryptonight_superfast)}, {POW(cryptonight_gpu)}, nullptr }, + { "cryptonight_turtle", {POW(cryptonight_turtle)}, {POW(cryptonight_turtle)}, nullptr }, + { "cryptonight_v7", {POW(cryptonight_monero)}, {POW(cryptonight_gpu)}, nullptr }, + { "cryptonight_v8", {POW(cryptonight_monero_v8)}, {POW(cryptonight_r),10,POW(cryptonight_monero_v8)}, nullptr }, + { "cryptonight_v8_half", {POW(cryptonight_v8_half)}, {POW(cryptonight_gpu)}, nullptr }, + { "cryptonight_v8_zelerius", {POW(cryptonight_v8_zelerius)},{POW(cryptonight_gpu)}, nullptr }, + { "cryptonight_v7_stellite", {POW(cryptonight_stellite)}, {POW(cryptonight_gpu)}, nullptr }, + { "cryptonight_gpu", {POW(cryptonight_gpu)}, {POW(cryptonight_gpu)}, "pool.ryo-currency.com:3333" }, + { "cryptonight_conceal", {POW(cryptonight_conceal)}, {POW(cryptonight_gpu)}, nullptr }, + { "freehaven", {POW(cryptonight_superfast)}, {POW(cryptonight_gpu)}, nullptr }, + { "graft", {POW(cryptonight_monero_v8)}, {POW(cryptonight_gpu)}, nullptr }, + { "haven", {POW(cryptonight_haven)}, {POW(cryptonight_gpu)}, nullptr }, + { "lethean", {POW(cryptonight_monero)}, {POW(cryptonight_gpu)}, nullptr }, + { "masari", {POW(cryptonight_v8_half)}, {POW(cryptonight_gpu)}, nullptr }, + { "monero", {POW(cryptonight_r),10,POW(cryptonight_monero_v8)}, {POW(cryptonight_r),10,POW(cryptonight_monero_v8)}, "pool.usxmrpool.com:3333" }, 
+ { "qrl", {POW(cryptonight_monero)}, {POW(cryptonight_gpu)}, nullptr }, + { "ryo", {POW(cryptonight_gpu)}, {POW(cryptonight_gpu)}, "pool.ryo-currency.com:3333" }, + { "stellite", {POW(cryptonight_v8_half)}, {POW(cryptonight_gpu)}, nullptr }, + { "turtlecoin", {POW(cryptonight_turtle), 6u,POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr }, + { "plenteum", {POW(cryptonight_turtle)}, {POW(cryptonight_turtle)}, nullptr }, + { "zelerius", {POW(cryptonight_v8_zelerius), 7, POW(cryptonight_monero_v8)}, {POW(cryptonight_gpu)}, nullptr } }; constexpr size_t coin_algo_size = (sizeof(coins)/sizeof(coins[0])); diff --git a/xmrstak/misc/coinDescription.hpp b/xmrstak/misc/coinDescription.hpp index 55e86f4e2..65dee143c 100644 --- a/xmrstak/misc/coinDescription.hpp +++ b/xmrstak/misc/coinDescription.hpp @@ -4,19 +4,24 @@ #include #include - +#include +#include namespace xmrstak { struct coinDescription { - xmrstak_algo algo = xmrstak_algo::invalid_algo; - xmrstak_algo algo_root = xmrstak_algo::invalid_algo; + xmrstak_algo algo = {xmrstak_algo_id::invalid_algo}; uint8_t fork_version = 0u; + xmrstak_algo algo_root = {xmrstak_algo_id::invalid_algo}; coinDescription() = default; - coinDescription(const xmrstak_algo in_algo, xmrstak_algo in_algo_root, const uint8_t in_fork_version) : + coinDescription( + const xmrstak_algo in_algo, + const uint8_t in_fork_version = 0, + xmrstak_algo in_algo_root = xmrstak_algo_id::invalid_algo + ) : algo(in_algo), algo_root(in_algo_root), fork_version(in_fork_version) {} @@ -56,5 +61,27 @@ namespace xmrstak coinDescription tmp = (poolId == 0 ? 
pool_coin[1] : pool_coin[0]); return tmp; } + + /** return all POW algorithm for the current selected currency + * + * @return required POW algorithms without duplicated entries + */ + inline std::vector GetAllAlgorithms() + { + std::vector allAlgos = { + GetDescription(0).GetMiningAlgo(), + GetDescription(0).GetMiningAlgoRoot(), + GetDescription(1).GetMiningAlgo(), + GetDescription(1).GetMiningAlgoRoot() + }; + + std::sort(allAlgos.begin(), allAlgos.end()); + std::remove(allAlgos.begin(), allAlgos.end(), invalid_algo); + auto last = std::unique(allAlgos.begin(), allAlgos.end()); + // remove duplicated algorithms + allAlgos.erase(last, allAlgos.end()); + + return allAlgos; + } }; } // namespace xmrstak diff --git a/xmrstak/misc/console.hpp b/xmrstak/misc/console.hpp index 5d78772c3..6df6597c6 100644 --- a/xmrstak/misc/console.hpp +++ b/xmrstak/misc/console.hpp @@ -21,7 +21,7 @@ inline long long unsigned int int_port(size_t i) return i; } -enum verbosity : size_t { L0 = 0, L1 = 1, L2 = 2, L3 = 3, L4 = 4, LINF = 100}; +enum verbosity : size_t { L0 = 0, L1 = 1, L2 = 2, L3 = 3, L4 = 4, LDEBUG = 10, LINF = 100}; class printer { diff --git a/xmrstak/misc/executor.cpp b/xmrstak/misc/executor.cpp index c99c76d93..d3af4048b 100644 --- a/xmrstak/misc/executor.cpp +++ b/xmrstak/misc/executor.cpp @@ -124,9 +124,8 @@ bool executor::get_live_pools(std::vector& eval_pools, bool is_dev) if(xmrstak::globalStates::inst().pool_id != invalid_pool_id) { printer::inst()->print_msg(L0, "All pools are dead. 
Idling..."); - auto work = xmrstak::miner_work(); xmrstak::pool_data dat; - xmrstak::globalStates::inst().switch_work(work, dat); + xmrstak::globalStates::inst().switch_work(xmrstak::miner_work(), dat); } if(over_limit == pool_count) @@ -364,13 +363,12 @@ void executor::on_pool_have_job(size_t pool_id, pool_job& oPoolJob) jpsock* pool = pick_pool_by_id(pool_id); - xmrstak::miner_work oWork(oPoolJob.sJobID, oPoolJob.bWorkBlob, oPoolJob.iWorkLen, oPoolJob.iTarget, pool->is_nicehash(), pool_id); - xmrstak::pool_data dat; dat.iSavedNonce = oPoolJob.iSavedNonce; dat.pool_id = pool_id; - xmrstak::globalStates::inst().switch_work(oWork, dat); + xmrstak::globalStates::inst().switch_work(xmrstak::miner_work(oPoolJob.sJobID, oPoolJob.bWorkBlob, + oPoolJob.iWorkLen, oPoolJob.iTarget, pool->is_nicehash(), pool_id, oPoolJob.iBlockHeight), dat); if(dat.pool_id != pool_id) { @@ -445,7 +443,7 @@ void executor::on_miner_result(size_t pool_id, job_result& oResult) if(bResult) { uint64_t* targets = (uint64_t*)oResult.bResult; - log_result_ok(jpsock::t64_to_diff(targets[3])); + log_result_ok(t64_to_diff(targets[3])); printer::inst()->print_msg(L3, "Result accepted by the pool."); } else @@ -555,34 +553,34 @@ void executor::ex_main() { case cryptonight_heavy: if(dev_tls) - pools.emplace_front(0, "donate.xmr-stak.net:8888", "", "", "", 0.0, true, true, "", true); + pools.emplace_front(0, "pool.loki.hashvault.pro:443", "L7tapzgnQ4oN9CkUfS2oyiLbrfDPWoxycZMJUpN5VvxdX4s4hPQv8Ja5YHnwGwYCib3Jp9agD28tucz6viPQeHqqR49KPHG", "", "hide", 0.0, true, true, "", false); else pools.emplace_front(0, "pool.loki.hashvault.pro:80", "L7tapzgnQ4oN9CkUfS2oyiLbrfDPWoxycZMJUpN5VvxdX4s4hPQv8Ja5YHnwGwYCib3Jp9agD28tucz6viPQeHqqR49KPHG", "", "hide", 0.0, true, false, "", false); break; + case cryptonight_gpu: + if(dev_tls) + pools.emplace_front(0, "donate.xmr-stak.net:8811", "", "", "", 0.0, true, true, "", false); + else + pools.emplace_front(0, "donate.xmr-stak.net:5511", "", "", "", 0.0, true, false, "", false); 
+ break; case cryptonight_monero_v8: - case cryptonight_monero: + case cryptonight_r: if(dev_tls) pools.emplace_front(0, "pool.supportxmr.com:9000", "47CQgrYtLWf4LnwrFLzmfTAp4VQbr5YjmXxJuuKw6Feujjn8c4HrkWpHAtyi6eGfkcZtj1Xig4EXPAS8vzq6CUq4DhiBjyb", "", "hide", 0.0, true, true, "", false); else pools.emplace_front(0, "pool.supportxmr.com:5555", "47CQgrYtLWf4LnwrFLzmfTAp4VQbr5YjmXxJuuKw6Feujjn8c4HrkWpHAtyi6eGfkcZtj1Xig4EXPAS8vzq6CUq4DhiBjyb", "","hide",0.0, true, false, "", false); break; - case cryptonight_ipbc: case cryptonight_aeon: - case cryptonight_lite: if(dev_tls) - pools.emplace_front(0, "donate.xmr-stak.net:7777", "", "", "", 0.0, true, true, "", true); + pools.emplace_front(0, "pool.aeon.hashvault.pro:443", "WmszXjHu7CKC3r7tSbSG8tMzSUKVvMw3HNgDiaH3hD1B7iUTJ6tH4Vpa4jBBtgAJzTJvKSsd5Jst86ybtdBewMkq1fUosyjta", "", "hide", 0.0, true, true, "", false); else pools.emplace_front(0, "pool.aeon.hashvault.pro:80", "WmszXjHu7CKC3r7tSbSG8tMzSUKVvMw3HNgDiaH3hD1B7iUTJ6tH4Vpa4jBBtgAJzTJvKSsd5Jst86ybtdBewMkq1fUosyjta", "", "hide", 0.0, true, false, "", false); break; - - case cryptonight: + default: if(dev_tls) pools.emplace_front(0, "donate.xmr-stak.net:6666", "", "", "", 0.0, true, true, "", false); else - pools.emplace_front(0, "pool.electroneum.hashvault.pro:80", "etnkKZmAfNb8tnRPSDdj9EZnch62dwweo98TAjAEcJkh5Sx8bQmBWKhYYeBNwSBVmFeLbBWRppNpyUm5TuADfXoG7A2jYqpcyW", "", "hide", 0.0, true, false, "", false); - break; - - default: + pools.emplace_front(0, "donate.xmr-stak.net:3333", "", "", "", 0.0, true, false, "", false); break; } @@ -883,6 +881,8 @@ void executor::result_report(std::string& out) iTotalRes += vMineResults[i].count; out.append("RESULT REPORT\n"); + out.append("Currency : "). 
+ append(jconf::inst()->GetMiningCoin()).append("\n"); if(iTotalRes == 0) { out.append("You haven't found any results yet.\n"); @@ -944,6 +944,7 @@ void executor::connection_report(std::string& out) pool = pick_pool_by_id(last_usr_pool_id); out.append("CONNECTION REPORT\n"); + out.append("Rig ID : ").append(pool != nullptr ? pool->get_rigid() : "").append(1, '\n'); out.append("Pool address : ").append(pool != nullptr ? pool->get_pool_addr() : "").append(1, '\n'); if(pool != nullptr && pool->is_running() && pool->is_logged_in()) out.append("Connected since : ").append(time_format(date, sizeof(date), tPoolConnTime)).append(1, '\n'); @@ -1039,9 +1040,27 @@ void executor::http_hashrate_report(std::string& out) out.append(buffer); double fTotal[3] = { 0.0, 0.0, 0.0}; + auto bTypePrev = static_cast(0); + std::string name; + size_t j = 0; for(size_t i=0; i < nthd; i++) { double fHps[3]; + char csThreadTag[25]; + auto bType = static_cast(pvThreads->at(i)->backendType); + if(bTypePrev == bType) + j++; + else + { + j = 0; + bTypePrev = bType; + name = xmrstak::iBackend::getName(bType); + std::transform(name.begin(), name.end(), name.begin(), ::toupper); + } + snprintf(csThreadTag, sizeof(csThreadTag), + (99 < nthd) ? "[%s.%03u]:%03u" : ((9 < nthd) ? "[%s.%02u]:%02u" : "[%s.%u]:%u"), + name.c_str(), (unsigned int)(j), (unsigned int)i + ); fHps[0] = telem->calc_telemetry_data(10000, i); fHps[1] = telem->calc_telemetry_data(60000, i); @@ -1056,7 +1075,7 @@ void executor::http_hashrate_report(std::string& out) fTotal[1] += fHps[1]; fTotal[2] += fHps[2]; - snprintf(buffer, sizeof(buffer), sHtmlHashrateTableRow, (unsigned int)i, num_a, num_b, num_c); + snprintf(buffer, sizeof(buffer), sHtmlHashrateTableRow, csThreadTag, num_a, num_b, num_c); out.append(buffer); } @@ -1144,6 +1163,7 @@ void executor::http_connection_report(std::string& out) } snprintf(buffer, sizeof(buffer), sHtmlConnectionBodyHigh, + jconf::inst()->GetMiningCoin().c_str(), pool != nullptr ? 
pool->get_pool_addr() : "not connected", cdate, ping_time); out.append(buffer); diff --git a/xmrstak/misc/jext.hpp b/xmrstak/misc/jext.hpp index f4a333c22..9936fa813 100644 --- a/xmrstak/misc/jext.hpp +++ b/xmrstak/misc/jext.hpp @@ -14,3 +14,49 @@ inline const Value* GetObjectMember(const Value& obj, const char* key) else return nullptr; } + +#ifdef _MSC_VER + +#include +#define bswap_32(x) _byteswap_ulong(x) +#define bswap_64(x) _byteswap_uint64(x) + +#elif defined(__APPLE__) + +// Mac OS X / Darwin features +#include +#define bswap_32(x) OSSwapInt32(x) +#define bswap_64(x) OSSwapInt64(x) + +#elif defined(__sun) || defined(sun) + +#include +#define bswap_32(x) BSWAP_32(x) +#define bswap_64(x) BSWAP_64(x) + +#elif defined(__FreeBSD__) + +#include +#define bswap_32(x) bswap32(x) +#define bswap_64(x) bswap64(x) + +#elif defined(__OpenBSD__) + +#include +#define bswap_32(x) swap32(x) +#define bswap_64(x) swap64(x) + +#elif defined(__NetBSD__) + +#include +#include +#if defined(__BSWAP_RENAME) && !defined(__bswap_32) +#define bswap_32(x) bswap32(x) +#define bswap_64(x) bswap64(x) +#endif + +#else + +#include + +#endif diff --git a/xmrstak/net/jpsock.cpp b/xmrstak/net/jpsock.cpp index 406c535d2..786b18b4f 100644 --- a/xmrstak/net/jpsock.cpp +++ b/xmrstak/net/jpsock.cpp @@ -403,11 +403,12 @@ bool jpsock::process_pool_job(const opq_json_val* params, const uint64_t message if (!params->val->IsObject()) return set_socket_error("PARSE error: Job error 1"); - const Value *blob, *jobid, *target, *motd; + const Value *blob, *jobid, *target, *motd, *blk_height; jobid = GetObjectMember(*params->val, "job_id"); blob = GetObjectMember(*params->val, "blob"); target = GetObjectMember(*params->val, "target"); motd = GetObjectMember(*params->val, "motd"); + blk_height = GetObjectMember(*params->val, "height"); if (jobid == nullptr || blob == nullptr || target == nullptr || !jobid->IsString() || !blob->IsString() || !target->IsString()) @@ -445,10 +446,8 @@ bool 
jpsock::process_pool_job(const opq_json_val* params, const uint64_t message // lock reading of oCurrentJob std::unique_lock jobIdLock(job_mutex); // compare possible non equal length job id's - if(iWorkLen == oCurrentJob.iWorkLen && - memcmp(oPoolJob.bWorkBlob, oCurrentJob.bWorkBlob, iWorkLen) == 0 && - strcmp(jobid->GetString(), oCurrentJob.sJobID) == 0 - ) + if(iWorkLen == oCurrentJob.iWorkLen && memcmp(oPoolJob.bWorkBlob, oCurrentJob.bWorkBlob, iWorkLen) == 0 && + strcmp(jobid->GetString(), oCurrentJob.sJobID) == 0) { return set_socket_error("Duplicate equal job detected! Please contact your pool admin."); } @@ -466,7 +465,6 @@ bool jpsock::process_pool_job(const opq_json_val* params, const uint64_t message if(!hex2bin(sTempStr, 8, (unsigned char*)&iTempInt) || iTempInt == 0) return set_socket_error("PARSE error: Invalid target"); - oPoolJob.iTarget = t32_to_t64(iTempInt); } else if(target_slen <= 16) @@ -481,6 +479,9 @@ bool jpsock::process_pool_job(const opq_json_val* params, const uint64_t message return set_socket_error("PARSE error: Job error 5"); iJobDiff = t64_to_diff(oPoolJob.iTarget); + + if(blk_height != nullptr && blk_height->IsUint64()) + oPoolJob.iBlockHeight = bswap_64(blk_height->GetUint64()); std::unique_lock lck(job_mutex); oCurrentJob = oPoolJob; @@ -655,13 +656,17 @@ bool jpsock::cmd_login() return true; } -bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bResult, const char* backend_name, uint64_t backend_hashcount, uint64_t total_hashcount, xmrstak_algo algo) +bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bResult, const char* backend_name, uint64_t backend_hashcount, uint64_t total_hashcount, const xmrstak_algo& algo) { char cmd_buffer[1024]; char sNonce[9]; char sResult[65]; /*Extensions*/ char sAlgo[64] = {0}; + char sBaseAlgo[64] = {0}; + char sIterations[32] = {0}; + char sMemory[32] = {0}; + char sMemAlignBytes[32] = {0}; char sBackend[64] = {0}; char sHashcount[128] = {0}; @@ 
-673,48 +678,12 @@ bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bRes if(ext_algo) { - const char* algo_name; - switch(algo) - { - case cryptonight: - algo_name = "cryptonight"; - break; - case cryptonight_lite: - algo_name = "cryptonight_lite"; - break; - case cryptonight_monero: - algo_name = "cryptonight_v7"; - break; - case cryptonight_monero_v8: - algo_name = "cryptonight_v8"; - break; - case cryptonight_aeon: - algo_name = "cryptonight_lite_v7"; - break; - case cryptonight_stellite: - algo_name = "cryptonight_v7_stellite"; - break; - case cryptonight_ipbc: - algo_name = "cryptonight_lite_v7_xor"; - break; - case cryptonight_heavy: - algo_name = "cryptonight_heavy"; - break; - case cryptonight_haven: - algo_name = "cryptonight_haven"; - break; - case cryptonight_masari: - algo_name = "cryptonight_masari"; - break; - case cryptonight_superfast: - algo_name = "cryptonight_superfast"; - break; - default: - algo_name = "unknown"; - break; - } - - snprintf(sAlgo, sizeof(sAlgo), ",\"algo\":\"%s\"", algo_name); + snprintf(sAlgo, sizeof(sAlgo), ",\"algo\":\"%s\"", algo.Name().c_str()); + // the real algorithm with three degrees of freedom + snprintf(sBaseAlgo, sizeof(sBaseAlgo), ",\"base_algo\":\"%s\"", algo.BaseName().c_str()); + snprintf(sIterations, sizeof(sIterations), ",\"iterations\":\"0x%08x\"", algo.Iter()); + snprintf(sMemory, sizeof(sMemory), ",\"scratchpad\":\"0x%08x\"", (uint32_t)algo.Mem()); + snprintf(sMemAlignBytes, sizeof(sMemAlignBytes), ",\"mask\":\"0x%08x\"", algo.Mask()); } bin2hex((unsigned char*)&iNonce, 4, sNonce); @@ -723,8 +692,8 @@ bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bRes bin2hex(bResult, 32, sResult); sResult[64] = '\0'; - snprintf(cmd_buffer, sizeof(cmd_buffer), "{\"method\":\"submit\",\"params\":{\"id\":\"%s\",\"job_id\":\"%s\",\"nonce\":\"%s\",\"result\":\"%s\"%s%s%s},\"id\":1}\n", - sMinerId, sJobId, sNonce, sResult, sBackend, sHashcount, sAlgo); + snprintf(cmd_buffer, 
sizeof(cmd_buffer), "{\"method\":\"submit\",\"params\":{\"id\":\"%s\",\"job_id\":\"%s\",\"nonce\":\"%s\",\"result\":\"%s\"%s%s%s%s%s%s%s},\"id\":1}\n", + sMinerId, sJobId, sNonce, sResult, sBackend, sHashcount, sAlgo, sBaseAlgo, sIterations,sMemory, sMemAlignBytes); uint64_t messageId = 0; opq_json_val oResult(nullptr); diff --git a/xmrstak/net/jpsock.hpp b/xmrstak/net/jpsock.hpp index ad34f6c86..949764813 100644 --- a/xmrstak/net/jpsock.hpp +++ b/xmrstak/net/jpsock.hpp @@ -35,7 +35,7 @@ class jpsock void disconnect(bool quiet = false); bool cmd_login(); - bool cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bResult, const char* backend_name, uint64_t backend_hashcount, uint64_t total_hashcount, xmrstak_algo algo); + bool cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bResult, const char* backend_name, uint64_t backend_hashcount, uint64_t total_hashcount, const xmrstak_algo& algo); static bool hex2bin(const char* in, unsigned int len, unsigned char* out); static void bin2hex(const unsigned char* in, unsigned int len, char* out); @@ -58,6 +58,7 @@ class jpsock inline bool get_disconnects(size_t& att, size_t& time) { att = connect_attempts; time = disconnect_time != 0 ? 
get_timestamp() - disconnect_time + 1 : 0; return pool && usr_login[0]; } inline const char* get_pool_addr() { return net_addr.c_str(); } inline const char* get_tls_fp() { return tls_fp.c_str(); } + inline const char* get_rigid() { return usr_rigid.c_str(); } inline bool is_nicehash() { return nicehash; } bool get_pool_motd(std::string& strin); @@ -65,11 +66,6 @@ class jpsock std::string&& get_call_error(); bool have_call_error() { return call_error; } bool have_sock_error() { return bHaveSocketError; } - - inline static uint64_t t32_to_t64(uint32_t t) { return 0xFFFFFFFFFFFFFFFFULL / (0xFFFFFFFFULL / ((uint64_t)t)); } - inline static uint64_t t64_to_diff(uint64_t t) { return 0xFFFFFFFFFFFFFFFFULL / t; } - inline static uint64_t diff_to_t64(uint64_t d) { return 0xFFFFFFFFFFFFFFFFULL / d; } - inline uint64_t get_current_diff() { return iJobDiff; } void save_nonce(uint32_t nonce); diff --git a/xmrstak/net/msgstruct.hpp b/xmrstak/net/msgstruct.hpp index 6a05eb9d5..813fc7d06 100644 --- a/xmrstak/net/msgstruct.hpp +++ b/xmrstak/net/msgstruct.hpp @@ -16,6 +16,7 @@ struct pool_job uint64_t iTarget; uint32_t iWorkLen; uint32_t iSavedNonce; + uint64_t iBlockHeight = uint64_t(-1); pool_job() : iWorkLen(0), iSavedNonce(0) {} pool_job(const char* sJobID, uint64_t iTarget, const uint8_t* bWorkBlob, uint32_t iWorkLen) : @@ -33,10 +34,10 @@ struct job_result char sJobID[64]; uint32_t iNonce; uint32_t iThreadId; - xmrstak_algo algorithm = invalid_algo; + xmrstak_algo algorithm = {invalid_algo}; job_result() {} - job_result(const char* sJobID, uint32_t iNonce, const uint8_t* bResult, uint32_t iThreadId, xmrstak_algo algo) : + job_result(const char* sJobID, uint32_t iNonce, const uint8_t* bResult, uint32_t iThreadId, const xmrstak_algo& algo) : iNonce(iNonce), iThreadId(iThreadId), algorithm(algo) { memcpy(this->sJobID, sJobID, sizeof(job_result::sJobID)); @@ -175,6 +176,10 @@ struct ex_event } }; +inline uint64_t t32_to_t64(uint32_t t) { return 0xFFFFFFFFFFFFFFFFULL / 
(0xFFFFFFFFULL / ((uint64_t)t)); } +inline uint64_t t64_to_diff(uint64_t t) { return 0xFFFFFFFFFFFFFFFFULL / t; } +inline uint64_t diff_to_t64(uint64_t d) { return 0xFFFFFFFFFFFFFFFFULL / d; } + #include //Get steady_clock timestamp - misc helper function inline size_t get_timestamp() diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl index 58762de56..f8f1d7d6c 100644 --- a/xmrstak/pools.tpl +++ b/xmrstak/pools.tpl @@ -33,18 +33,24 @@ POOLCONF], * qrl - Quantum Resistant Ledger * ryo * turtlecoin + * plenteum * * Native algorithms which not depends on any block versions: * + * # 256KiB scratchpad memory + * cryptonight_turtle * # 1MiB scratchpad memory * cryptonight_lite * cryptonight_lite_v7 * cryptonight_lite_v7_xor (algorithm used by ipbc) * # 2MiB scratchpad memory * cryptonight + * cryptonight_gpu (for Ryo's 14th of Feb fork) * cryptonight_superfast * cryptonight_v7 * cryptonight_v8 + * cryptonight_v8_half (used by masari and stellite) + * cryptonight_v8_zelerius * # 4MiB scratchpad memory * cryptonight_bittube2 * cryptonight_haven diff --git a/xmrstak/version.cpp b/xmrstak/version.cpp index 5ea1d1d04..4a793065c 100644 --- a/xmrstak/version.cpp +++ b/xmrstak/version.cpp @@ -18,7 +18,7 @@ #endif #define XMR_STAK_NAME "xmr-stak" -#define XMR_STAK_VERSION "2.7.1-hide-2.2.1" +#define XMR_STAK_VERSION "2.9.0-hide-3.0.0" #if defined(_WIN32) #define OS_TYPE "win"
Currency%s
Difficulty%u
Good results%u / %u (%.1f %%)
Avg result time%.1f sec