Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix an issue with sysconf returning the wrong last level cache values on Linux running on certain AMD Processors #109567

Merged
merged 2 commits into from
Nov 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion src/coreclr/gc/gcconfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,9 @@ class GCConfigStringHolder
INT_CONFIG (GCWriteBarrier, "GCWriteBarrier", NULL, 0, "Specifies whether GC should use more precise but slower write barrier") \
STRING_CONFIG(GCName, "GCName", "System.GC.Name", "Specifies the path of the standalone GC implementation.") \
INT_CONFIG (GCSpinCountUnit, "GCSpinCountUnit", 0, 0, "Specifies the spin count unit used by the GC.") \
INT_CONFIG (GCDynamicAdaptationMode, "GCDynamicAdaptationMode", "System.GC.DynamicAdaptationMode", 0, "Enable the GC to dynamically adapt to application sizes.")
INT_CONFIG (GCDynamicAdaptationMode, "GCDynamicAdaptationMode", "System.GC.DynamicAdaptationMode", 0, "Enable the GC to dynamically adapt to application sizes.") \
BOOL_CONFIG (GCCacheSizeFromSysConf, "GCCacheSizeFromSysConf", NULL, false, "Specifies using sysconf to retrieve the last level cache size for Unix.")

// This class is responsible for retrieving configuration information
// for how the GC should operate.
class GCConfig
Expand Down
1 change: 1 addition & 0 deletions src/coreclr/gc/unix/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
set(CMAKE_INCLUDE_CURRENT_DIR ON)
include_directories("../env")
include_directories("..")

include(configure.cmake)

Expand Down
207 changes: 104 additions & 103 deletions src/coreclr/gc/unix/gcenv.unix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@
#include "gcenv.structs.h"
#include "gcenv.base.h"
#include "gcenv.os.h"
#include "gcenv.ee.h"
#include "gcenv.unix.inl"
#include "volatile.h"
#include "gcconfig.h"
#include "numasupport.h"

#if HAVE_SWAPCTL
Expand Down Expand Up @@ -792,101 +794,125 @@ bool ReadMemoryValueFromFile(const char* filename, uint64_t* val)
return result;
}

#define UPDATE_CACHE_SIZE_AND_LEVEL(NEW_CACHE_SIZE, NEW_CACHE_LEVEL) if (NEW_CACHE_SIZE > ((long)cacheSize)) { cacheSize = NEW_CACHE_SIZE; cacheLevel = NEW_CACHE_LEVEL; }

static size_t GetLogicalProcessorCacheSizeFromOS()
static void GetLogicalProcessorCacheSizeFromSysConf(size_t* cacheLevel, size_t* cacheSize)
{
size_t cacheLevel = 0;
size_t cacheSize = 0;
long size;
assert (cacheLevel != nullptr);
assert (cacheSize != nullptr);

// sysconf can return -1 if the cache size is unavailable in some distributions and 0 in others.
// UPDATE_CACHE_SIZE_AND_LEVEL should handle both cases by not updating cacheSize if either case is met.
#ifdef _SC_LEVEL1_DCACHE_SIZE
size = sysconf(_SC_LEVEL1_DCACHE_SIZE);
UPDATE_CACHE_SIZE_AND_LEVEL(size, 1)
#endif
#ifdef _SC_LEVEL2_CACHE_SIZE
size = sysconf(_SC_LEVEL2_CACHE_SIZE);
UPDATE_CACHE_SIZE_AND_LEVEL(size, 2)
#endif
#ifdef _SC_LEVEL3_CACHE_SIZE
size = sysconf(_SC_LEVEL3_CACHE_SIZE);
UPDATE_CACHE_SIZE_AND_LEVEL(size, 3)
#endif
#ifdef _SC_LEVEL4_CACHE_SIZE
size = sysconf(_SC_LEVEL4_CACHE_SIZE);
UPDATE_CACHE_SIZE_AND_LEVEL(size, 4)
#endif
#if defined(_SC_LEVEL1_DCACHE_SIZE) || defined(_SC_LEVEL2_CACHE_SIZE) || defined(_SC_LEVEL3_CACHE_SIZE) || defined(_SC_LEVEL4_CACHE_SIZE)
const int cacheLevelNames[] =
{
_SC_LEVEL1_DCACHE_SIZE,
_SC_LEVEL2_CACHE_SIZE,
_SC_LEVEL3_CACHE_SIZE,
_SC_LEVEL4_CACHE_SIZE,
};

#if defined(TARGET_LINUX) && !defined(HOST_ARM) && !defined(HOST_X86)
if (cacheSize == 0)
for (int i = ARRAY_SIZE(cacheLevelNames) - 1; i >= 0; i--)
{
//
// Fallback to retrieve cachesize via /sys/.. if sysconf was not available
// for the platform. Currently musl and arm64 should be only cases to use
// this method to determine cache size.
//
size_t level;
char path_to_size_file[] = "/sys/devices/system/cpu/cpu0/cache/index-/size";
char path_to_level_file[] = "/sys/devices/system/cpu/cpu0/cache/index-/level";
int index = 40;
assert(path_to_size_file[index] == '-');
assert(path_to_level_file[index] == '-');

for (int i = 0; i < 5; i++)
long size = sysconf(cacheLevelNames[i]);
if (size > 0)
{
path_to_size_file[index] = (char)(48 + i);
*cacheSize = (size_t)size;
*cacheLevel = i + 1;
break;
}
}
#endif
}

uint64_t cache_size_from_sys_file = 0;
static void GetLogicalProcessorCacheSizeFromSysFs(size_t* cacheLevel, size_t* cacheSize)
{
assert (cacheLevel != nullptr);
assert (cacheSize != nullptr);

if (ReadMemoryValueFromFile(path_to_size_file, &cache_size_from_sys_file))
{
// uint64_t to long conversion as ReadMemoryValueFromFile takes a uint64_t* as an argument for the val argument.
size = (long)cache_size_from_sys_file;
path_to_level_file[index] = (char)(48 + i);
#if defined(TARGET_LINUX) && !defined(HOST_ARM) && !defined(HOST_X86)
//
// Retrieve the cache size via sysfs by reading the file /sys/devices/system/cpu/cpu0/cache/index{LastLevelCache}/size
// for the platform. Currently, musl and arm64 should be the only cases that use
// this method to determine the cache size.
//
size_t level;
char path_to_size_file[] = "/sys/devices/system/cpu/cpu0/cache/index-/size";
char path_to_level_file[] = "/sys/devices/system/cpu/cpu0/cache/index-/level";
int index = 40;
assert(path_to_size_file[index] == '-');
assert(path_to_level_file[index] == '-');

for (int i = 0; i < 5; i++)
{
path_to_size_file[index] = (char)(48 + i);

if (ReadMemoryValueFromFile(path_to_level_file, &level))
{
UPDATE_CACHE_SIZE_AND_LEVEL(size, level)
}
uint64_t cache_size_from_sys_file = 0;

else
{
cacheSize = std::max((long)cacheSize, size);
}
if (ReadMemoryValueFromFile(path_to_size_file, &cache_size_from_sys_file))
{
*cacheSize = std::max(*cacheSize, (size_t)cache_size_from_sys_file);

path_to_level_file[index] = (char)(48 + i);
if (ReadMemoryValueFromFile(path_to_level_file, &level))
{
*cacheLevel = level;
}
}
}
#endif
#endif
}

#if (defined(HOST_ARM64) || defined(HOST_LOONGARCH64)) && !defined(TARGET_APPLE)
if (cacheSize == 0)
static void GetLogicalProcessorCacheSizeFromHeuristic(size_t* cacheLevel, size_t* cacheSize)
{
assert (cacheLevel != nullptr);
assert (cacheSize != nullptr);

#if (defined(TARGET_LINUX) && !defined(TARGET_APPLE))
{
// We expect to get the L3 cache size for Arm64, but that info is currently expected to be missing
// on most machines.
//
// _SC_LEVEL*_*CACHE_SIZE is not yet present. Work is in progress to enable this for arm64
//
// /sys/devices/system/cpu/cpu*/cache/index*/ is also not yet present in most systems.
// Arm64 patch is in Linux kernel tip.
//
// midr_el1 is available in "/sys/devices/system/cpu/cpu0/regs/identification/midr_el1",
// but without an exhaustive list of ARM64 processors, any decode of midr_el1
// would likely be incomplete.

// Published information on ARM64 architectures is limited.
// If we use recent high core count chips as a guide for state of the art, we find
// total L3 cache to be 1-2MB/core. As always, there are exceptions.

// Estimate cache size based on CPU count
// Assume lower core count are lighter weight parts which are likely to have smaller caches
// Assume L3$/CPU grows linearly from 256K to 1.5M/CPU as logicalCPUs grows from 2 to 12 CPUs
// Use the following best-effort heuristics, depending on the CPU count
// 1 ~ 4 : 4 MB
// 5 ~ 16 : 8 MB
// 17 ~ 64 : 16 MB
// 65+ : 32 MB
DWORD logicalCPUs = g_processAffinitySet.Count();
if (logicalCPUs < 5)
{
*cacheSize = 4;
}
else if (logicalCPUs < 17)
{
*cacheSize = 8;
}
else if (logicalCPUs < 65)
{
*cacheSize = 16;
}
else
{
*cacheSize = 32;
}

cacheSize = logicalCPUs * std::min(1536, std::max(256, (int)logicalCPUs * 128)) * 1024;
*cacheSize *= (1024 * 1024);
}
#endif
}

static size_t GetLogicalProcessorCacheSizeFromOS()
{
size_t cacheLevel = 0;
size_t cacheSize = 0;

if (GCConfig::GetGCCacheSizeFromSysConf())
{
GetLogicalProcessorCacheSizeFromSysConf(&cacheLevel, &cacheSize);
}

if (cacheSize == 0)
{
GetLogicalProcessorCacheSizeFromSysFs(&cacheLevel, &cacheSize);
if (cacheSize == 0)
{
GetLogicalProcessorCacheSizeFromHeuristic(&cacheLevel, &cacheSize);
}
}

#if HAVE_SYSCTLBYNAME
if (cacheSize == 0)
Expand All @@ -905,40 +931,15 @@ static size_t GetLogicalProcessorCacheSizeFromOS()
if (success)
{
assert(cacheSizeFromSysctl > 0);
cacheSize = ( size_t) cacheSizeFromSysctl;
cacheSize = (size_t) cacheSizeFromSysctl;
}
}
#endif

#if (defined(HOST_ARM64) || defined(HOST_LOONGARCH64)) && !defined(TARGET_APPLE)
if (cacheLevel != 3)
{
// We expect to get the L3 cache size for Arm64, but that info is currently expected to be missing
// on most machines.
// Hence, just use the following best-effort heuristics, depending on the CPU count
// 1 ~ 4 : 4 MB
// 5 ~ 16 : 8 MB
// 17 ~ 64 : 16 MB
// 65+ : 32 MB
DWORD logicalCPUs = g_processAffinitySet.Count();
if (logicalCPUs < 5)
{
cacheSize = 4;
}
else if (logicalCPUs < 17)
{
cacheSize = 8;
}
else if (logicalCPUs < 65)
{
cacheSize = 16;
}
else
{
cacheSize = 32;
}

cacheSize *= (1024 * 1024);
GetLogicalProcessorCacheSizeFromHeuristic(&cacheLevel, &cacheSize);
}
#endif

Expand Down
Loading