Skip to content

Commit

Permalink
Merge pull request #2635 from psychocrypt/rx-topic-refactorAutoSuggestion
Browse files Browse the repository at this point in the history

[RX] CPU: numa support/better autoconfig
  • Loading branch information
fireice-uk authored Dec 17, 2019
2 parents 3f8c373 + 32381fd commit 8cdf4a4
Show file tree
Hide file tree
Showing 21 changed files with 466 additions and 284 deletions.
12 changes: 5 additions & 7 deletions xmrstak/backend/amd/amd_gpu/gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
size_t scratchPadSize = 0;
for(const auto algo : neededAlgorithms)
{
scratchPadSize = std::max(scratchPadSize, algo.Mem());
scratchPadSize = std::max(scratchPadSize, algo.L3());
}

size_t g_thd = ctx->rawIntensity;
Expand All @@ -182,7 +182,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
ctx->rx_dataset[ctx->deviceIdx] = clCreateBuffer(opencl_ctx, CL_MEM_READ_ONLY, dataset_size, nullptr, &ret);
}
else {
void* dataset = getRandomXDataset();
void* dataset = getRandomXDataset(0);
ctx->rx_dataset[ctx->deviceIdx] = clCreateBuffer(opencl_ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, dataset_size, dataset, &ret);
}

Expand All @@ -193,7 +193,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
}
}

ctx->rx_scratchpads = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, (user_algo.Mem() + 64) * g_thd, nullptr, &ret);
ctx->rx_scratchpads = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, (user_algo.L3() + 64) * g_thd, nullptr, &ret);
if(ret != CL_SUCCESS)
{
printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create RandomX scratchpads.", err_to_str(ret));
Expand Down Expand Up @@ -294,9 +294,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
for(const auto miner_algo : neededAlgorithms)
{
// scratchpad size for the selected mining algorithm
size_t hashMemSize = miner_algo.Mem();
int threadMemMask = miner_algo.Mask();
int hashIterations = miner_algo.Iter();
size_t hashMemSize = miner_algo.L3();

std::string options;
options += " -DALGO=" + std::to_string(miner_algo.Id());
Expand Down Expand Up @@ -1364,7 +1362,7 @@ uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment)
size_t RXSetJob(GpuContext *ctx, uint8_t *input, size_t input_len, uint64_t target, const uint8_t* seed_hash, const xmrstak_algo& miner_algo)
{
cl_int ret;
void* dataset = getRandomXDataset();
void* dataset = getRandomXDataset(0);
const size_t dataset_size = getRandomXDatasetSize();

if((memcmp(ctx->rx_dataset_seedhash, seed_hash, sizeof(ctx->rx_dataset_seedhash)) != 0))
Expand Down
7 changes: 4 additions & 3 deletions xmrstak/backend/amd/autoAdjust.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ class autoAdjust
size_t hashMemSize = 0;
for(const auto algo : neededAlgorithms)
{
hashMemSize = std::max(hashMemSize, algo.Mem());
hashMemSize = std::max(hashMemSize, algo.L3());
}

std::string conf;
Expand Down Expand Up @@ -171,9 +171,10 @@ class autoAdjust
ctx.gcnAsm = false;


if(hashMemSize < CN_MEMORY)
size_t _2MiB = 2llu * 1024 * 1024;
if(hashMemSize < _2MiB)
{
size_t factor = CN_MEMORY / hashMemSize;
size_t factor = _2MiB / hashMemSize;
// increase all intensity relative to the original scratchpad size
maxThreads *= factor;
}
Expand Down
8 changes: 6 additions & 2 deletions xmrstak/backend/amd/minethd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@

#include "xmrstak/backend/cpu/crypto/cryptonight.h"
#include "xmrstak/backend/cpu/crypto/cryptonight_aesni.h"
#include "xmrstak/backend/cpu/hwlocMemory.hpp"
#include "xmrstak/backend/cpu/hwlocHelper.hpp"
#include "xmrstak/backend/cpu/minethd.hpp"
#include "xmrstak/jconf.hpp"
#include "xmrstak/misc/configEditor.hpp"
Expand Down Expand Up @@ -68,9 +68,11 @@ minethd::minethd(miner_work& pWork, size_t iNo, GpuContext* ctx, const jconf::th

order_guard.wait();

#if defined(CONF_NO_HWLOC) || defined(_WIN32)
if(affinity >= 0) //-1 means no affinity
if(!cpu::minethd::thd_setaffinity(oWorkThd.native_handle(), affinity))
printer::inst()->print_msg(L1, "WARNING setting affinity failed.");
#endif
}

extern "C"
Expand Down Expand Up @@ -164,7 +166,7 @@ std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_wor
void minethd::work_main()
{
if(affinity >= 0) //-1 means no affinity
bindMemoryToNUMANode(affinity);
hwlocBind(affinity);

order_fix.set_value();
std::unique_lock<std::mutex> lck(thd_aff_set);
Expand All @@ -173,6 +175,8 @@ void minethd::work_main()

cryptonight_ctx* cpu_ctx;
cpu_ctx = cpu::minethd::minethd_alloc_ctx();
cpu_ctx->numa = affinity < 0 ? 0 : numdaId(affinity);
randomX_global_ctx::inst().init(cpu_ctx->numa);

if(cpu_ctx == nullptr)
{
Expand Down
2 changes: 1 addition & 1 deletion xmrstak/backend/cpu/autoAdjust.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ class autoAdjust
size_t hashMemSize = 0;
for(const auto algo : neededAlgorithms)
{
hashMemSize = std::max(hashMemSize, algo.Mem());
hashMemSize = std::max(hashMemSize, algo.L3());
}
const size_t hashMemSizeKB = hashMemSize / 1024u;

Expand Down
108 changes: 72 additions & 36 deletions xmrstak/backend/cpu/autoAdjustHwloc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "xmrstak/misc/configEditor.hpp"
#include "xmrstak/misc/console.hpp"
#include "xmrstak/params.hpp"
#include "xmrstak/backend/cpu/hwlocHelper.hpp"

#ifdef _WIN32
#include <windows.h>
Expand All @@ -15,6 +16,7 @@

#include <hwloc.h>
#include <stdio.h>
#include <algorithm>

namespace xmrstak
{
Expand All @@ -30,17 +32,18 @@ class autoAdjustHwloc

for(const auto algo : neededAlgorithms)
{
hashMemSize = std::max(hashMemSize, algo.Mem());
l3MemRequire = std::max(l3MemRequire, algo.L3());
l2MemRequire = std::max(l2MemRequire, algo.L2());
}
halfHashMemSize = hashMemSize / 2u;
}

bool printConfig()
{

hwloc_topology_t topology;
hwloc_topology_init(&topology);
hwloc_topology_load(topology);
if(hwloc_topology_load(topology) < 0)
return false;

std::string conf;
configEditor configTpl{};
Expand All @@ -54,25 +57,24 @@ class autoAdjustHwloc
bool is_successful = true;
try
{
std::vector<hwloc_obj_t> tlcs;
tlcs.reserve(16);
results.reserve(16);

std::vector<hwloc_obj_t> tlcs;
findChildrenCaches(hwloc_get_root_obj(topology),
[&tlcs](hwloc_obj_t found) { tlcs.emplace_back(found); });

if(tlcs.size() == 0)
throw(std::runtime_error("The CPU doesn't seem to have a cache."));

printer::inst()->print_msg(LDEBUG,"process %u cache elements", uint32_t(tlcs.size()));
for(hwloc_obj_t obj : tlcs)
processTopLevelCache(obj);

for(uint32_t id : results)

for(const auto& thd : threads)
{
conf += std::string(" { \"low_power_mode\" : ");
conf += std::string((id & 0x8000000) != 0 ? "true" : "false");
conf += std::to_string(thd.num_hashes);
conf += std::string(", \"affine_to_cpu\" : ");
conf += std::to_string(id & 0x7FFFFFF);
conf += std::to_string(thd.core_id);
conf += std::string(" },\n");
}
}
Expand All @@ -92,10 +94,20 @@ class autoAdjustHwloc
}

private:
size_t hashMemSize = 0;
size_t halfHashMemSize = 0;
size_t l3MemRequire = 0;
size_t l2MemRequire = 0;

struct Thread
{
Thread(const uint32_t c_id, const uint32_t n_hash) :
core_id(c_id), num_hashes(n_hash)
{}

uint32_t core_id = 0;
uint32_t num_hashes = 1;
};

std::vector<uint32_t> results;
std::vector<Thread> threads;

template <typename func>
inline void findChildrenByType(hwloc_obj_t obj, hwloc_obj_type_t type, func lambda)
Expand Down Expand Up @@ -143,16 +155,16 @@ class autoAdjustHwloc
if(obj->attr == nullptr)
throw(std::runtime_error("Cache object hasn't got attributes."));

size_t PUs = 0;
findChildrenByType(obj, HWLOC_OBJ_PU, [&PUs](hwloc_obj_t found) { PUs++; });
size_t numPUs = 0;
findChildrenByType(obj, HWLOC_OBJ_PU, [&numPUs](hwloc_obj_t found) { numPUs++; });

//Strange case, but we will handle it silently, surely there must be one PU somewhere?
if(PUs == 0)
if(numPUs == 0)
return;

if(obj->attr->cache.size == 0)
{
//We will always have one child if PUs > 0
//We will always have one child if numPUs > 0
if(!isCacheObject(obj->children[0]))
throw(std::runtime_error("The CPU doesn't seem to have a cache."));

Expand All @@ -162,27 +174,58 @@ class autoAdjustHwloc
return;
}

size_t cacheSize = obj->attr->cache.size;
if(isCacheExclusive(obj))
size_t l3CacheSize = obj->attr->cache.size;
size_t numL2Caches = obj->arity;
bool isExclusive = isCacheExclusive(obj);
size_t l2CacheSize = 0u;
if(obj->attr->cache.depth == 3)
{
for(size_t i = 0; i < obj->arity; i++)
for(size_t i = 0; i < numL2Caches; i++)
{
hwloc_obj_t l2obj = obj->children[i];
//If L2 is exclusive and greater or equal to 2MB add room for one more hash
if(isCacheObject(l2obj) && l2obj->attr != nullptr && l2obj->attr->cache.size >= hashMemSize)
cacheSize += hashMemSize;
if(isCacheObject(l2obj) && l2obj->attr)
{
//If L3 is exclusive and greater or equal to 2MB add room for one more hash
if(isExclusive && l2obj->attr->cache.size >= l3MemRequire)
l3CacheSize += l3MemRequire;
else
l2CacheSize += l2obj->attr->cache.size;
}
}
}

size_t l2CacheSizePerHash = l2CacheSize / numL2Caches;
printer::inst()->print_msg(LDEBUG,"%u L3 cache, required per hash %u", uint32_t(l3CacheSize), uint32_t(l3MemRequire));
printer::inst()->print_msg(LDEBUG,"%u L2 cache, required per hash %u", uint32_t(l2CacheSize), uint32_t(l2MemRequire));

size_t l3CacheHashes = std::max(l3CacheSize / l3MemRequire, size_t(1u));
size_t l2CacheHashes = std::max(l2CacheSizePerHash / l2MemRequire, size_t(1u)) * numL2Caches;

// we have no lvl2 cache or our top lvl cache is L2
if(l2CacheSize == 0u)
l2CacheHashes = l3CacheHashes;

std::vector<hwloc_obj_t> cores;
cores.reserve(16);
findChildrenByType(obj, HWLOC_OBJ_CORE, [&cores](hwloc_obj_t found) { cores.emplace_back(found); });

size_t cacheHashes = (cacheSize + halfHashMemSize) / hashMemSize;
printer::inst()->print_msg(LDEBUG,"%u L3 hash limit", uint32_t(l3CacheHashes));
printer::inst()->print_msg(LDEBUG,"%u L2 hash limit", uint32_t(l2CacheHashes));
printer::inst()->print_msg(LDEBUG,"%u PU(s) available", uint32_t(numPUs));
size_t numHashCacheLimited = std::min(l2CacheHashes, l3CacheHashes);
// do not use more PUs than available
size_t usePus = std::min(numHashCacheLimited, numPUs);

// currently do not use multi hash per PU (all tests has shown it is slower)
//size_t numHashesPerPu = std::max(numHashCacheLimited / numPUs, size_t(1u));
size_t numHashesPerPu = 1u;

printer::inst()->print_msg(LDEBUG,"use %u PU(s)", uint32_t(usePus));
printer::inst()->print_msg(LDEBUG,"use %u hashe(s) per pu", uint32_t(numHashesPerPu));

//Firstly allocate PU 0 of every CORE, then PU 1 etc.
size_t pu_id = 0;
while(cacheHashes > 0 && PUs > 0)
while(usePus > 0)
{
bool allocated_pu = false;
for(hwloc_obj_t core : cores)
Expand All @@ -192,19 +235,12 @@ class autoAdjustHwloc

size_t os_id = core->children[pu_id]->os_index;

if(cacheHashes > PUs)
{
cacheHashes -= 2;
os_id |= 0x8000000; //double hash marker bit
}
else
cacheHashes--;
PUs--;

allocated_pu = true;
results.emplace_back(os_id);
threads.emplace_back(Thread(os_id, numHashesPerPu));

usePus--;

if(cacheHashes == 0)
if(usePus == 0)
break;
}

Expand Down
19 changes: 16 additions & 3 deletions xmrstak/backend/cpu/crypto/common/VirtualMemory_unix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@
# include <mach/vm_statistics.h>
#endif

#if defined(__linux__) && !defined(MAP_HUGE_SHIFT)
# include <asm-generic/mman-common.h>
#endif

#include "xmrstak/misc/console.hpp"

int xmrstak::VirtualMemory::m_globalFlags = 0;

Expand Down Expand Up @@ -109,8 +114,7 @@ void *xmrstak::VirtualMemory::allocateLargePagesMemory(size_t size, size_t page_
page_size_flags |= MAP_HUGE_2MB;
else if(page_size == 1024u)
page_size_flags |= MAP_HUGE_1GB;
#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT)
#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT)

void *mem = mmap(0, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE | page_size_flags, 0, 0);
# endif

Expand All @@ -128,7 +132,16 @@ void xmrstak::VirtualMemory::flushInstructionCache(void *p, size_t size)

void xmrstak::VirtualMemory::freeLargePagesMemory(void *p, size_t size)
{
munmap(p, size);
if(munmap(p, size) != 0)
{
printer::inst()->print_msg(LDEBUG,"munmap failed %llu", (uint64_t)size);
size_t page3gib = 3llu*1024*1024*1024;
printer::inst()->print_msg(LDEBUG,"try to unmap ", page3gib);
if(munmap(p, page3gib) != 0)
{
printer::inst()->print_msg(LDEBUG,"munmap failed %llu", (uint64_t)page3gib);
}
}
}


Expand Down
Loading

0 comments on commit 8cdf4a4

Please sign in to comment.