Skip to content

Commit

Permalink
Merge pull request #2635 from psychocrypt/rx-topic-refactorAutoSuggestion
Browse files Browse the repository at this point in the history

[RX] CPU: numa support/better autoconfig
  • Loading branch information
fireice-uk authored Dec 17, 2019
2 parents 3f8c373 + 32381fd commit 8cdf4a4
Show file tree
Hide file tree
Showing 21 changed files with 466 additions and 284 deletions.
12 changes: 5 additions & 7 deletions xmrstak/backend/amd/amd_gpu/gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
size_t scratchPadSize = 0;
for(const auto algo : neededAlgorithms)
{
scratchPadSize = std::max(scratchPadSize, algo.Mem());
scratchPadSize = std::max(scratchPadSize, algo.L3());
}

size_t g_thd = ctx->rawIntensity;
Expand All @@ -182,7 +182,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
ctx->rx_dataset[ctx->deviceIdx] = clCreateBuffer(opencl_ctx, CL_MEM_READ_ONLY, dataset_size, nullptr, &ret);
}
else {
void* dataset = getRandomXDataset();
void* dataset = getRandomXDataset(0);
ctx->rx_dataset[ctx->deviceIdx] = clCreateBuffer(opencl_ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, dataset_size, dataset, &ret);
}

Expand All @@ -193,7 +193,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
}
}

ctx->rx_scratchpads = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, (user_algo.Mem() + 64) * g_thd, nullptr, &ret);
ctx->rx_scratchpads = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, (user_algo.L3() + 64) * g_thd, nullptr, &ret);
if(ret != CL_SUCCESS)
{
printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create RandomX scratchpads.", err_to_str(ret));
Expand Down Expand Up @@ -294,9 +294,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
for(const auto miner_algo : neededAlgorithms)
{
// scratchpad size for the selected mining algorithm
size_t hashMemSize = miner_algo.Mem();
int threadMemMask = miner_algo.Mask();
int hashIterations = miner_algo.Iter();
size_t hashMemSize = miner_algo.L3();

std::string options;
options += " -DALGO=" + std::to_string(miner_algo.Id());
Expand Down Expand Up @@ -1364,7 +1362,7 @@ uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment)
size_t RXSetJob(GpuContext *ctx, uint8_t *input, size_t input_len, uint64_t target, const uint8_t* seed_hash, const xmrstak_algo& miner_algo)
{
cl_int ret;
void* dataset = getRandomXDataset();
void* dataset = getRandomXDataset(0);
const size_t dataset_size = getRandomXDatasetSize();

if((memcmp(ctx->rx_dataset_seedhash, seed_hash, sizeof(ctx->rx_dataset_seedhash)) != 0))
Expand Down
7 changes: 4 additions & 3 deletions xmrstak/backend/amd/autoAdjust.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ class autoAdjust
size_t hashMemSize = 0;
for(const auto algo : neededAlgorithms)
{
hashMemSize = std::max(hashMemSize, algo.Mem());
hashMemSize = std::max(hashMemSize, algo.L3());
}

std::string conf;
Expand Down Expand Up @@ -171,9 +171,10 @@ class autoAdjust
ctx.gcnAsm = false;


if(hashMemSize < CN_MEMORY)
size_t _2MiB = 2llu * 1024 * 1024;
if(hashMemSize < _2MiB)
{
size_t factor = CN_MEMORY / hashMemSize;
size_t factor = _2MiB / hashMemSize;
// increase all intensity relative to the original scratchpad size
maxThreads *= factor;
}
Expand Down
8 changes: 6 additions & 2 deletions xmrstak/backend/amd/minethd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@

#include "xmrstak/backend/cpu/crypto/cryptonight.h"
#include "xmrstak/backend/cpu/crypto/cryptonight_aesni.h"
#include "xmrstak/backend/cpu/hwlocMemory.hpp"
#include "xmrstak/backend/cpu/hwlocHelper.hpp"
#include "xmrstak/backend/cpu/minethd.hpp"
#include "xmrstak/jconf.hpp"
#include "xmrstak/misc/configEditor.hpp"
Expand Down Expand Up @@ -68,9 +68,11 @@ minethd::minethd(miner_work& pWork, size_t iNo, GpuContext* ctx, const jconf::th

order_guard.wait();

#if defined(CONF_NO_HWLOC) || defined(_WIN32)
if(affinity >= 0) //-1 means no affinity
if(!cpu::minethd::thd_setaffinity(oWorkThd.native_handle(), affinity))
printer::inst()->print_msg(L1, "WARNING setting affinity failed.");
#endif
}

extern "C"
Expand Down Expand Up @@ -164,7 +166,7 @@ std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_wor
void minethd::work_main()
{
if(affinity >= 0) //-1 means no affinity
bindMemoryToNUMANode(affinity);
hwlocBind(affinity);

order_fix.set_value();
std::unique_lock<std::mutex> lck(thd_aff_set);
Expand All @@ -173,6 +175,8 @@ void minethd::work_main()

cryptonight_ctx* cpu_ctx;
cpu_ctx = cpu::minethd::minethd_alloc_ctx();
cpu_ctx->numa = affinity < 0 ? 0 : numdaId(affinity);
randomX_global_ctx::inst().init(cpu_ctx->numa);

if(cpu_ctx == nullptr)
{
Expand Down
2 changes: 1 addition & 1 deletion xmrstak/backend/cpu/autoAdjust.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ class autoAdjust
size_t hashMemSize = 0;
for(const auto algo : neededAlgorithms)
{
hashMemSize = std::max(hashMemSize, algo.Mem());
hashMemSize = std::max(hashMemSize, algo.L3());
}
const size_t hashMemSizeKB = hashMemSize / 1024u;

Expand Down
108 changes: 72 additions & 36 deletions xmrstak/backend/cpu/autoAdjustHwloc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "xmrstak/misc/configEditor.hpp"
#include "xmrstak/misc/console.hpp"
#include "xmrstak/params.hpp"
#include "xmrstak/backend/cpu/hwlocHelper.hpp"

#ifdef _WIN32
#include <windows.h>
Expand All @@ -15,6 +16,7 @@

#include <hwloc.h>
#include <stdio.h>
#include <algorithm>

namespace xmrstak
{
Expand All @@ -30,17 +32,18 @@ class autoAdjustHwloc

for(const auto algo : neededAlgorithms)
{
hashMemSize = std::max(hashMemSize, algo.Mem());
l3MemRequire = std::max(l3MemRequire, algo.L3());
l2MemRequire = std::max(l2MemRequire, algo.L2());
}
halfHashMemSize = hashMemSize / 2u;
}

bool printConfig()
{

hwloc_topology_t topology;
hwloc_topology_init(&topology);
hwloc_topology_load(topology);
if(hwloc_topology_load(topology) < 0)
return false;

std::string conf;
configEditor configTpl{};
Expand All @@ -54,25 +57,24 @@ class autoAdjustHwloc
bool is_successful = true;
try
{
std::vector<hwloc_obj_t> tlcs;
tlcs.reserve(16);
results.reserve(16);

std::vector<hwloc_obj_t> tlcs;
findChildrenCaches(hwloc_get_root_obj(topology),
[&tlcs](hwloc_obj_t found) { tlcs.emplace_back(found); });

if(tlcs.size() == 0)
throw(std::runtime_error("The CPU doesn't seem to have a cache."));

printer::inst()->print_msg(LDEBUG,"process %u cache elements", uint32_t(tlcs.size()));
for(hwloc_obj_t obj : tlcs)
processTopLevelCache(obj);

for(uint32_t id : results)

for(const auto& thd : threads)
{
conf += std::string(" { \"low_power_mode\" : ");
conf += std::string((id & 0x8000000) != 0 ? "true" : "false");
conf += std::to_string(thd.num_hashes);
conf += std::string(", \"affine_to_cpu\" : ");
conf += std::to_string(id & 0x7FFFFFF);
conf += std::to_string(thd.core_id);
conf += std::string(" },\n");
}
}
Expand All @@ -92,10 +94,20 @@ class autoAdjustHwloc
}

private:
size_t hashMemSize = 0;
size_t halfHashMemSize = 0;
size_t l3MemRequire = 0;
size_t l2MemRequire = 0;

struct Thread
{
Thread(const uint32_t c_id, const uint32_t n_hash) :
core_id(c_id), num_hashes(n_hash)
{}

uint32_t core_id = 0;
uint32_t num_hashes = 1;
};

std::vector<uint32_t> results;
std::vector<Thread> threads;

template <typename func>
inline void findChildrenByType(hwloc_obj_t obj, hwloc_obj_type_t type, func lambda)
Expand Down Expand Up @@ -143,16 +155,16 @@ class autoAdjustHwloc
if(obj->attr == nullptr)
throw(std::runtime_error("Cache object hasn't got attributes."));

size_t PUs = 0;
findChildrenByType(obj, HWLOC_OBJ_PU, [&PUs](hwloc_obj_t found) { PUs++; });
size_t numPUs = 0;
findChildrenByType(obj, HWLOC_OBJ_PU, [&numPUs](hwloc_obj_t found) { numPUs++; });

//Strange case, but we will handle it silently, surely there must be one PU somewhere?
if(PUs == 0)
if(numPUs == 0)
return;

if(obj->attr->cache.size == 0)
{
//We will always have one child if PUs > 0
//We will always have one child if numPUs > 0
if(!isCacheObject(obj->children[0]))
throw(std::runtime_error("The CPU doesn't seem to have a cache."));

Expand All @@ -162,27 +174,58 @@ class autoAdjustHwloc
return;
}

size_t cacheSize = obj->attr->cache.size;
if(isCacheExclusive(obj))
size_t l3CacheSize = obj->attr->cache.size;
size_t numL2Caches = obj->arity;
bool isExclusive = isCacheExclusive(obj);
size_t l2CacheSize = 0u;
if(obj->attr->cache.depth == 3)
{
for(size_t i = 0; i < obj->arity; i++)
for(size_t i = 0; i < numL2Caches; i++)
{
hwloc_obj_t l2obj = obj->children[i];
//If L2 is exclusive and greater or equal to 2MB add room for one more hash
if(isCacheObject(l2obj) && l2obj->attr != nullptr && l2obj->attr->cache.size >= hashMemSize)
cacheSize += hashMemSize;
if(isCacheObject(l2obj) && l2obj->attr)
{
//If L3 is exclusive and greater or equal to 2MB add room for one more hash
if(isExclusive && l2obj->attr->cache.size >= l3MemRequire)
l3CacheSize += l3MemRequire;
else
l2CacheSize += l2obj->attr->cache.size;
}
}
}

size_t l2CacheSizePerHash = l2CacheSize / numL2Caches;
printer::inst()->print_msg(LDEBUG,"%u L3 cache, required per hash %u", uint32_t(l3CacheSize), uint32_t(l3MemRequire));
printer::inst()->print_msg(LDEBUG,"%u L2 cache, required per hash %u", uint32_t(l2CacheSize), uint32_t(l2MemRequire));

size_t l3CacheHashes = std::max(l3CacheSize / l3MemRequire, size_t(1u));
size_t l2CacheHashes = std::max(l2CacheSizePerHash / l2MemRequire, size_t(1u)) * numL2Caches;

// we have no lvl2 cache or our top lvl cache is L2
if(l2CacheSize == 0u)
l2CacheHashes = l3CacheHashes;

std::vector<hwloc_obj_t> cores;
cores.reserve(16);
findChildrenByType(obj, HWLOC_OBJ_CORE, [&cores](hwloc_obj_t found) { cores.emplace_back(found); });

size_t cacheHashes = (cacheSize + halfHashMemSize) / hashMemSize;
printer::inst()->print_msg(LDEBUG,"%u L3 hash limit", uint32_t(l3CacheHashes));
printer::inst()->print_msg(LDEBUG,"%u L2 hash limit", uint32_t(l2CacheHashes));
printer::inst()->print_msg(LDEBUG,"%u PU(s) available", uint32_t(numPUs));
size_t numHashCacheLimited = std::min(l2CacheHashes, l3CacheHashes);
// do not use more PUs than available
size_t usePus = std::min(numHashCacheLimited, numPUs);

// currently do not use multi hash per PU (all tests has shown it is slower)
//size_t numHashesPerPu = std::max(numHashCacheLimited / numPUs, size_t(1u));
size_t numHashesPerPu = 1u;

printer::inst()->print_msg(LDEBUG,"use %u PU(s)", uint32_t(usePus));
printer::inst()->print_msg(LDEBUG,"use %u hashe(s) per pu", uint32_t(numHashesPerPu));

//Firstly allocate PU 0 of every CORE, then PU 1 etc.
size_t pu_id = 0;
while(cacheHashes > 0 && PUs > 0)
while(usePus > 0)
{
bool allocated_pu = false;
for(hwloc_obj_t core : cores)
Expand All @@ -192,19 +235,12 @@ class autoAdjustHwloc

size_t os_id = core->children[pu_id]->os_index;

if(cacheHashes > PUs)
{
cacheHashes -= 2;
os_id |= 0x8000000; //double hash marker bit
}
else
cacheHashes--;
PUs--;

allocated_pu = true;
results.emplace_back(os_id);
threads.emplace_back(Thread(os_id, numHashesPerPu));

usePus--;

if(cacheHashes == 0)
if(usePus == 0)
break;
}

Expand Down
19 changes: 16 additions & 3 deletions xmrstak/backend/cpu/crypto/common/VirtualMemory_unix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@
# include <mach/vm_statistics.h>
#endif

#if defined(__linux__) && !defined(MAP_HUGE_SHIFT)
# include <asm-generic/mman-common.h>
#endif

#include "xmrstak/misc/console.hpp"

int xmrstak::VirtualMemory::m_globalFlags = 0;

Expand Down Expand Up @@ -109,8 +114,7 @@ void *xmrstak::VirtualMemory::allocateLargePagesMemory(size_t size, size_t page_
page_size_flags |= MAP_HUGE_2MB;
else if(page_size == 1024u)
page_size_flags |= MAP_HUGE_1GB;
#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT)
#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT)

void *mem = mmap(0, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE | page_size_flags, 0, 0);
# endif

Expand All @@ -128,7 +132,16 @@ void xmrstak::VirtualMemory::flushInstructionCache(void *p, size_t size)

void xmrstak::VirtualMemory::freeLargePagesMemory(void *p, size_t size)
{
munmap(p, size);
if(munmap(p, size) != 0)
{
printer::inst()->print_msg(LDEBUG,"munmap failed %llu", (uint64_t)size);
size_t page3gib = 3llu*1024*1024*1024;
printer::inst()->print_msg(LDEBUG,"try to unmap ", page3gib);
if(munmap(p, page3gib) != 0)
{
printer::inst()->print_msg(LDEBUG,"munmap failed %llu", (uint64_t)page3gib);
}
}
}


Expand Down
Loading

0 comments on commit 8cdf4a4

Please sign in to comment.