Skip to content

Commit

Permalink
Fix some test failures
Browse files Browse the repository at this point in the history
  • Loading branch information
Flamefire committed Aug 5, 2022
1 parent 117af79 commit ccef9b2
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 0 deletions.
6 changes: 6 additions & 0 deletions easybuild/easyconfigs/p/PyTorch/PyTorch-1.9.0-foss-2020b.eb
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ patches = [
'PyTorch-1.8.1_fix-faulty-asserts-and-skip-test.patch',
'PyTorch-1.8.1_increase-distributed-test-timeout.patch',
'PyTorch-1.9.0_avoid-failures-in-test_unary_ufuncs.patch',
'PyTorch-1.9.0_fix-kineto-crash.patch',
'PyTorch-1.9.0_fix-vsx-vector-functions.patch',
'PyTorch-1.9.0_skip-lstm-serialization-test.patch',
]
Expand All @@ -46,6 +47,7 @@ checksums = [
'7a6e512274f0b8673f4f207a5bc53387d88be7e79833f42d20365668b2118071',
# PyTorch-1.9.0_avoid-failures-in-test_unary_ufuncs.patch
'f600e6831f8a03af007845687d1e0f65b2394ca89a9dab5178e2cdc9bd384d43',
'1ed5e125f7922ea577d43053a6652aedc21cc036157e101c0e3b9aee9029d3b0', # PyTorch-1.9.0_fix-kineto-crash.patch
'a4733b6b16a0db4ee5f85f2b103abc29bd711cfc5253f8dd8494d2b0c1509516', # PyTorch-1.9.0_fix-vsx-vector-functions.patch
# PyTorch-1.9.0_skip-lstm-serialization-test.patch
'0fc14e29bd7530bcc09f4212df3c846072b1313216da86b827e102b85d695f49',
Expand Down Expand Up @@ -79,6 +81,10 @@ excluded_tests = {
# Bad tests: https://github.com/pytorch/pytorch/issues/60260
'distributed/elastic/utils/distributed_test',
'distributed/elastic/multiprocessing/api_test',
# These tests fail on A10s at the very least: they always time out, no matter how long the timeout is.
# Possibly related to NCCL 2.8.3: https://docs.nvidia.com/deeplearning/nccl/release-notes/rel_2-8-3.html
'distributed/test_distributed_fork',
'distributed/test_distributed_spawn',
# Tests from this suite time out often. The process group backend is deprecated anyway.
'distributed/rpc/test_process_group_agent',
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ patches = [
'PyTorch-1.9.0_fix-min-amount-of-devices-for-test.patch',
'PyTorch-1.9.0_fix-testnn-on-A100.patch',
'PyTorch-1.9.0_fix-use-after-destruct-in-cudaipctypes.patch',
'PyTorch-1.9.0_fix-kineto-crash.patch',
'PyTorch-1.9.0_fix-vsx-vector-functions.patch',
'PyTorch-1.9.0_increase-test-cuda-tolerance.patch',
'PyTorch-1.9.0_increase-tolerance-for-distributed-tests.patch',
Expand Down Expand Up @@ -64,6 +65,7 @@ checksums = [
'8e8b417782e2f3004462c32338e12685e7296d15207f3e3087dcb8015e648f98', # PyTorch-1.9.0_fix-testnn-on-A100.patch
# PyTorch-1.9.0_fix-use-after-destruct-in-cudaipctypes.patch
'67960bf9140baf004b07e29f7c2b338e7bc4e4e4f2c931768be44f58526e605f',
'1ed5e125f7922ea577d43053a6652aedc21cc036157e101c0e3b9aee9029d3b0', # PyTorch-1.9.0_fix-kineto-crash.patch
'a4733b6b16a0db4ee5f85f2b103abc29bd711cfc5253f8dd8494d2b0c1509516', # PyTorch-1.9.0_fix-vsx-vector-functions.patch
# PyTorch-1.9.0_increase-test-cuda-tolerance.patch
'73de855ab1ed38043c7fb2a983927786b83d7547aefed926f19e554e2214838a',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ patches = [
'PyTorch-1.9.0_fix-min-amount-of-devices-for-test.patch',
'PyTorch-1.9.0_fix-testnn-on-A100.patch',
'PyTorch-1.9.0_fix-use-after-destruct-in-cudaipctypes.patch',
'PyTorch-1.9.0_fix-kineto-crash.patch',
'PyTorch-1.9.0_fix-vsx-vector-functions.patch',
'PyTorch-1.9.0_increase-test-cuda-tolerance.patch',
'PyTorch-1.9.0_increase-tolerance-for-distributed-tests.patch',
Expand Down Expand Up @@ -63,6 +64,7 @@ checksums = [
'8e8b417782e2f3004462c32338e12685e7296d15207f3e3087dcb8015e648f98', # PyTorch-1.9.0_fix-testnn-on-A100.patch
# PyTorch-1.9.0_fix-use-after-destruct-in-cudaipctypes.patch
'67960bf9140baf004b07e29f7c2b338e7bc4e4e4f2c931768be44f58526e605f',
'1ed5e125f7922ea577d43053a6652aedc21cc036157e101c0e3b9aee9029d3b0', # PyTorch-1.9.0_fix-kineto-crash.patch
'a4733b6b16a0db4ee5f85f2b103abc29bd711cfc5253f8dd8494d2b0c1509516', # PyTorch-1.9.0_fix-vsx-vector-functions.patch
# PyTorch-1.9.0_increase-test-cuda-tolerance.patch
'73de855ab1ed38043c7fb2a983927786b83d7547aefed926f19e554e2214838a',
Expand Down Expand Up @@ -111,6 +113,10 @@ excluded_tests = {
# Bad tests: https://github.com/pytorch/pytorch/issues/60260
'distributed/elastic/utils/distributed_test',
'distributed/elastic/multiprocessing/api_test',
# These tests fail on A10s at the very least: they always time out, no matter how long the timeout is.
# Possibly related to NCCL 2.8.3: https://docs.nvidia.com/deeplearning/nccl/release-notes/rel_2-8-3.html
'distributed/test_distributed_fork',
'distributed/test_distributed_spawn',
# Tests from this suite time out often. The process group backend is deprecated anyway.
'distributed/rpc/test_process_group_agent',
]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
Fix a crash during application shutdown visible in test_profiler on some machines.
See https://github.com/pytorch/kineto/pull/642

Author: Alexander Grund (TU Dresden)

diff -aur a/third_party/kineto/libkineto/src/EventProfilerController.cpp b/third_party/kineto/libkineto/src/EventProfilerController.cpp
--- a/third_party/kineto/libkineto/src/EventProfilerController.cpp 2022-08-05 13:10:46.175716618 +0200
+++ b/third_party/kineto/libkineto/src/EventProfilerController.cpp 2022-08-05 13:16:00.654118490 +0200
@@ -231,9 +231,14 @@

// Must be called under lock
void EventProfilerController::start(CUcontext ctx) {
- profilerMap()[ctx] = unique_ptr<EventProfilerController>(
+ // Avoid static initialization order fiasco:
+ // We need the profilerMap and with it all controllers to be destroyed
+ // before everything the controller accesses gets destroyed.
+ // Hence access the profilerMap after initialization of the controller.
+ auto controller = unique_ptr<EventProfilerController>(
new EventProfilerController(
ctx, ConfigLoader::instance(), detail::HeartbeatMonitor::instance()));
+ profilerMap()[ctx] = std::move(controller);
}

// Must be called under lock

0 comments on commit ccef9b2

Please sign in to comment.