-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Debug Out Of Memory (OOM) Errors in Simulation and Production
From version 7.0, jemalloc is used by FDB. Thus we can use jemalloc's profiling capability, with some minimal changes:
diff --git a/cmake/Jemalloc.cmake b/cmake/Jemalloc.cmake
index aedb83480..e2836a11f 100644
--- a/cmake/Jemalloc.cmake
+++ b/cmake/Jemalloc.cmake
@@ -15,7 +15,9 @@ ExternalProject_add(Jemalloc_project
"${JEMALLOC_DIR}/lib/libjemalloc.a"
"${JEMALLOC_DIR}/lib/libjemalloc_pic.a"
PATCH_COMMAND patch -p1 < ${CMAKE_SOURCE_DIR}/cmake/jemalloc.patch
- CONFIGURE_COMMAND ./configure --prefix=${JEMALLOC_DIR} --enable-static --disable-cxx --enable-prof
+ CONFIGURE_COMMAND ./configure --prefix=${JEMALLOC_DIR} --enable-static --disable-cxx --enable-prof --enable-stats --with-malloc-conf=prof:true,prof_prefix:/var/tmp/fdbserver
BUILD_IN_SOURCE ON
BUILD_COMMAND make
INSTALL_DIR "${JEMALLOC_DIR}"
Note that the `--with-malloc-conf` option is used to configure jemalloc. This is needed because `fdbmonitor` can't set the environment variable `MALLOC_CONF` when invoking `fdbserver`. Build `fdb` with `USE_CUSTOM_JEMALLOC` (v7.3) set to `ON` to trigger a build that includes a jemalloc with the above `--with-malloc-conf` changes.
Another change needed is to remove `SignalSafeUnwind.*`, because `chain_dl_iterate_phdr` in glibc is overridden in that file, which causes deadlocks during jemalloc initialization. As a result, `SlowTaskWorkload.actor.cpp` should also be removed. Finally, `flow` should be linked with `jemalloc` so that many binaries can link correctly.
diff --git a/fdbserver/CMakeLists.txt b/fdbserver/CMakeLists.txt
index 31d8c3ed8..bd6d32f9c 100644
--- a/fdbserver/CMakeLists.txt
+++ b/fdbserver/CMakeLists.txt
@@ -280,7 +280,7 @@ set(FDBSERVER_SRCS
workloads/Sideband.actor.cpp
workloads/SidebandSingle.actor.cpp
workloads/SimpleAtomicAdd.actor.cpp
- workloads/SlowTaskWorkload.actor.cpp
+# workloads/SlowTaskWorkload.actor.cpp
workloads/SnapTest.actor.cpp
workloads/SpecialKeySpaceCorrectness.actor.cpp
workloads/StatusWorkload.actor.cpp
diff --git a/flow/CMakeLists.txt b/flow/CMakeLists.txt
index 5cd37810b..801c8e732 100644
--- a/flow/CMakeLists.txt
+++ b/flow/CMakeLists.txt
@@ -55,8 +55,8 @@ set(FLOW_SRCS
Profiler.actor.cpp
Profiler.h
SendBufferIterator.h
- SignalSafeUnwind.cpp
- SignalSafeUnwind.h
+# SignalSafeUnwind.cpp
+# SignalSafeUnwind.h
SimpleOpt.h
StreamCipher.h
SystemMonitor.cpp
@@ -135,6 +135,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config.h.cmake ${CMAKE_CURRENT_BINARY
add_flow_target(STATIC_LIBRARY NAME flow SRCS ${FLOW_SRCS})
target_link_libraries(flow PRIVATE stacktrace)
target_link_libraries(flow PUBLIC fmt::fmt)
+target_link_libraries(flow PRIVATE jemalloc)
add_flow_target(STATIC_LIBRARY NAME flow_sampling SRCS ${FLOW_SRCS})
target_link_libraries(flow_sampling PRIVATE stacktrace)
To debug OOM, we may want to dump the profile when FDB exits with an error. The following change achieves this:
diff --git a/flow/Platform.actor.cpp b/flow/Platform.actor.cpp
index 748050b37..8b7f999aa 100644
--- a/flow/Platform.actor.cpp
+++ b/flow/Platform.actor.cpp
@@ -57,6 +57,10 @@
#include "fdbclient/AnnotateActor.h"
+#ifdef USE_JEMALLOC
+#include <jemalloc/jemalloc.h>
+#endif
+
#ifdef _WIN32
#include <windows.h>
#include <winioctl.h>
@@ -3247,6 +3251,12 @@ extern "C" void flushAndExit(int exitCode) {
// to the crashAndDie call below.
TerminateProcess(GetCurrentProcess(), exitCode);
#else
+#ifdef USE_JEMALLOC
+ // malloc_stats_print(nullptr, nullptr, nullptr);
+ if (exitCode != FDB_EXIT_SUCCESS) {
+ mallctl("prof.dump", nullptr, nullptr, nullptr, 0);
+ }
+#endif
// Send a signal to allow the Kernel to generate a coredump for this process.
// See: https://man7.org/linux/man-pages/man5/core.5.html
// The abort method will send a SIGABRT, which causes the kernel to collect a coredump.
Once we have the heap profile, we can generate the figure with the unstripped `fdbserver` binary:
jeprof --show_bytes --pdf fdbserver jeprof.82759.48.i48.heap > figure.pdf
The resulting `figure.pdf` accounts for all `malloc` calls made by `fdb`; not counted are the `mmap` "magazine" allocations done by `FastAlloc`.
See Leak Checking and Heap Profiling. Also see "fdbserver 7.x on sqlite OOM on RHEL9" if you observe an inexplicable rise in resident memory.
- Install gperftools if needed (skip this step if using the development docker), e.g.,
yum install -y gperftools-devel gperftools-libs gperftools ghostscript.x86_64 gv.x86_64
- Compile with gperftools:
cmake -DUSE_GPERFTOOLS=1 ../foundationdb -G Ninja; ninja
(may need to comment out `target_compile_definitions(gperftools PUBLIC USE_GPERFTOOLS)` in `cmake/FindGperftools.cmake`).
- Run with gperftools enabled:
HEAPPROFILE=/tmp/fdbserver fdbserver [args...]
- Analyze the heap profile:
pprof-symbolize gperf-build/bin/fdbserver /tmp/fdbserver.0065.heap
Note that the profiling runs are at least 10X slower than the runs without profiling.
See a sample profile here.
See massif manual.
- Compile with Valgrind, e.g.,
cmake -S ${HOME}/src/foundationdb -B ${HOME}/build_output -D USE_CCACHE=ON -D USE_WERROR=ON -D USE_VALGRIND=ON -G Ninja && ninja -C ${HOME}/build_output -j 80 fdbserver
- Run with massif tool, e.g.,
valgrind --tool=massif ./build_output/bin/fdbserver -r simulation --crash --logsize 1024MB -f ./foundationdb/tests/fast/ConfigureLocked.toml -s 93093841 -b on
- `GetMagazineSample` is logged when the fast allocator adds more magazines; its backtraces reliably point to the problem.
- `HugeArenaSample` could point to arenas that eventually get deallocated, so it might not indicate a memory leak.