Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cherrypick NV fixes to release/2.4 #48263

Merged
1 change: 1 addition & 0 deletions paddle/fluid/framework/distributed_strategy.proto
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ message BuildStrategy {
optional bool allow_cuda_graph_capture = 14 [ default = false ];
optional int32 reduce_strategy = 15 [ default = 0 ];
optional bool fuse_gemm_epilogue = 16 [ default = false ];
optional string debug_graphviz_path = 17;
}

message ExecutionStrategy {
Expand Down
8 changes: 8 additions & 0 deletions paddle/fluid/operators/fused/cudnn_norm_conv.cu.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,14 @@ struct NormConvolutionArgs {
int stride,
int dilation,
int group) {
PADDLE_ENFORCE_LT(
ctx.GetComputeCapability(),
90,
phi::errors::PreconditionNotMet(
"Expect compute compatiblity to be less than 90, but got %d. "
"CUDNN FusedOps is no longer available on H100 and later "
"devices.",
ctx.GetComputeCapability()));
PADDLE_ENFORCE_EQ(
input_shape.size(),
4U,
Expand Down
8 changes: 4 additions & 4 deletions paddle/fluid/operators/fused/cudnn_norm_conv_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ TEST(CudnnNormConvFp16, K1S1) {
phi::GPUContext *ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));

if (ctx->GetComputeCapability() < 70) {
if (ctx->GetComputeCapability() < 70 || ctx->GetComputeCapability() >= 90) {
ASSERT_THROW(test.CheckForward(1e-3, true),
paddle::platform::EnforceNotMet);
ASSERT_THROW(test.CheckBackward(1e-3, true),
Expand Down Expand Up @@ -472,7 +472,7 @@ TEST(CudnnNormConvFp16, K3S1) {
phi::GPUContext *ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));

if (ctx->GetComputeCapability() < 70) {
if (ctx->GetComputeCapability() < 70 || ctx->GetComputeCapability() >= 90) {
ASSERT_THROW(test.CheckForward(1e-3, true),
paddle::platform::EnforceNotMet);
ASSERT_THROW(test.CheckBackward(1e-3, true),
Expand Down Expand Up @@ -502,7 +502,7 @@ TEST(CudnnNormConvFp16, K1S1O4) {
phi::GPUContext *ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));

if (ctx->GetComputeCapability() < 70) {
if (ctx->GetComputeCapability() < 70 || ctx->GetComputeCapability() >= 90) {
ASSERT_THROW(test.CheckForward(1e-3, true),
paddle::platform::EnforceNotMet);
ASSERT_THROW(test.CheckBackward(1e-3, true),
Expand Down Expand Up @@ -532,7 +532,7 @@ TEST(CudnnNormConvFp16, K1S2O4) {
phi::GPUContext *ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));

if (ctx->GetComputeCapability() <= 70) {
if (ctx->GetComputeCapability() <= 70 || ctx->GetComputeCapability() >= 90) {
ASSERT_THROW(test.CheckForward(1e-3, true),
paddle::platform::EnforceNotMet);
ASSERT_THROW(test.CheckBackward(1e-3), paddle::platform::EnforceNotMet);
Expand Down
24 changes: 13 additions & 11 deletions paddle/fluid/operators/fused/fused_dropout_act_bias.h
Original file line number Diff line number Diff line change
Expand Up @@ -256,17 +256,19 @@ template <typename T,
int BlockSizeX,
int BlockSizeY,
int VecSize,
typename Functor>
__global__ void FusedDropoutActBiasGrad(Functor act_grad,
const T *dout,
const MaskType *mask,
const T *src,
const T *bias,
const T factor,
const int64_t rows,
const int64_t cols,
T *dx,
T *dbias) {
typename Functor,
int THREADS_PER_CTA = BlockSizeX *BlockSizeY>
__global__ __launch_bounds__(THREADS_PER_CTA) void FusedDropoutActBiasGrad(
Functor act_grad,
const T *dout,
const MaskType *mask,
const T *src,
const T *bias,
const T factor,
const int64_t rows,
const int64_t cols,
T *dx,
T *dbias) {
int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x;

using LoadT = phi::AlignedVector<T, VecSize>;
Expand Down
124 changes: 81 additions & 43 deletions python/paddle/fluid/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@
if os.name == 'nt':
third_lib_path = current_path + os.sep + '..' + os.sep + 'libs'
# Will load shared library from 'path' on windows
os.environ[
'path'] = current_path + ';' + third_lib_path + ';' + os.environ[
'path']
os.environ['path'] = (
current_path + ';' + third_lib_path + ';' + os.environ['path']
)
sys.path.insert(0, third_lib_path)
# Note: from python3.8, PATH will not take effect
# https://github.com/python/cpython/pull/12302
Expand All @@ -47,20 +47,24 @@

except ImportError as e:
from .. import compat as cpt

if os.name == 'nt':
executable_path = os.path.abspath(os.path.dirname(sys.executable))
raise ImportError(
"""NOTE: You may need to run \"set PATH=%s;%%PATH%%\"
if you encounters \"DLL load failed\" errors. If you have python
installed in other directory, replace \"%s\" with your own
directory. The original error is: \n %s""" %
(executable_path, executable_path, cpt.get_exception_message(e)))
directory. The original error is: \n %s"""
% (executable_path, executable_path, cpt.get_exception_message(e))
)
else:
raise ImportError(
"""NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\"
if you encounters \"libmkldnn.so not found\" errors. If you have python
installed in other directory, replace \"/usr/local/lib\" with your own
directory. The original error is: \n""" + cpt.get_exception_message(e))
directory. The original error is: \n"""
+ cpt.get_exception_message(e)
)
except Exception as e:
raise e

Expand All @@ -70,36 +74,45 @@ def avx_supported():
Whether current system(Linux, MacOS, Windows) is supported with AVX.
"""
from .. import compat as cpt

sysstr = platform.system().lower()
has_avx = False
if sysstr == 'linux':
try:
has_avx = os.popen('cat /proc/cpuinfo | grep -i avx').read() != ''
pipe = os.popen('cat /proc/cpuinfo | grep -i avx')
has_avx = pipe.read() != ''
pipe.close()
except Exception as e:
sys.stderr.write('Can not get the AVX flag from /proc/cpuinfo.\n'
'The original error is: %s\n' %
cpt.get_exception_message(e))
sys.stderr.write(
'Can not get the AVX flag from /proc/cpuinfo.\n'
'The original error is: %s\n' % cpt.get_exception_message(e)
)
return has_avx
elif sysstr == 'darwin':
try:
has_avx = os.popen(
'sysctl machdep.cpu.features | grep -i avx').read() != ''
pipe = os.popen('sysctl machdep.cpu.features | grep -i avx')
has_avx = pipe.read() != ''
pipe.close()
except Exception as e:
sys.stderr.write(
'Can not get the AVX flag from machdep.cpu.features.\n'
'The original error is: %s\n' % cpt.get_exception_message(e))
'The original error is: %s\n' % cpt.get_exception_message(e)
)
if not has_avx:
import subprocess

pipe = subprocess.Popen(
'sysctl machdep.cpu.leaf7_features | grep -i avx',
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
stderr=subprocess.PIPE,
)
_ = pipe.communicate()
has_avx = True if pipe.returncode == 0 else False
return has_avx
elif sysstr == 'windows':
import ctypes

ONE_PAGE = ctypes.c_size_t(0x1000)

def asm_func(code_str, restype=ctypes.c_uint32, argtypes=()):
Expand All @@ -109,24 +122,31 @@ def asm_func(code_str, restype=ctypes.c_uint32, argtypes=()):
pfnVirtualAlloc.restype = ctypes.c_void_p
MEM_COMMIT = ctypes.c_ulong(0x1000)
PAGE_READWRITE = ctypes.c_ulong(0x4)
address = pfnVirtualAlloc(None, ONE_PAGE, MEM_COMMIT,
PAGE_READWRITE)
address = pfnVirtualAlloc(
None, ONE_PAGE, MEM_COMMIT, PAGE_READWRITE
)
if not address:
raise Exception("Failed to VirtualAlloc")

# Copy the code into the memory segment
memmove = ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.c_void_p,
ctypes.c_void_p,
ctypes.c_size_t)(ctypes._memmove_addr)
memmove = ctypes.CFUNCTYPE(
ctypes.c_void_p,
ctypes.c_void_p,
ctypes.c_void_p,
ctypes.c_size_t,
)(ctypes._memmove_addr)
if memmove(address, code_str, len(code_str)) < 0:
raise Exception("Failed to memmove")

# Enable execute permissions
PAGE_EXECUTE = ctypes.c_ulong(0x10)
pfnVirtualProtect = ctypes.windll.kernel32.VirtualProtect
res = pfnVirtualProtect(ctypes.c_void_p(address),
ONE_PAGE, PAGE_EXECUTE,
ctypes.byref(ctypes.c_ulong(0)))
res = pfnVirtualProtect(
ctypes.c_void_p(address),
ONE_PAGE,
PAGE_EXECUTE,
ctypes.byref(ctypes.c_ulong(0)),
)
if not res:
raise Exception("Failed VirtualProtect")

Expand All @@ -135,7 +155,8 @@ def asm_func(code_str, restype=ctypes.c_uint32, argtypes=()):
pfnGetCurrentProcess.restype = ctypes.c_void_p
prochandle = ctypes.c_void_p(pfnGetCurrentProcess())
res = ctypes.windll.kernel32.FlushInstructionCache(
prochandle, ctypes.c_void_p(address), ONE_PAGE)
prochandle, ctypes.c_void_p(address), ONE_PAGE
)
if not res:
raise Exception("Failed FlushInstructionCache")

Expand All @@ -153,12 +174,14 @@ def asm_func(code_str, restype=ctypes.c_uint32, argtypes=()):
# Convert the code_str into a function that returns uint
func, address = asm_func(code_str)
retval = func()
ctypes.windll.kernel32.VirtualFree(ctypes.c_void_p(address),
ctypes.c_size_t(0), ONE_PAGE)
ctypes.windll.kernel32.VirtualFree(
ctypes.c_void_p(address), ctypes.c_size_t(0), ONE_PAGE
)
except Exception as e:
sys.stderr.write('Failed getting the AVX flag on Windows.\n'
'The original error is: %s\n' %
cpt.get_exception_message(e))
sys.stderr.write(
'Failed getting the AVX flag on Windows.\n'
'The original error is: %s\n' % cpt.get_exception_message(e)
)
return (retval & (1 << avx_bit)) > 0
else:
sys.stderr.write('Do not get AVX flag on %s\n' % sysstr)
Expand All @@ -167,10 +190,10 @@ def asm_func(code_str, restype=ctypes.c_uint32, argtypes=()):

def run_shell_command(cmd):
import subprocess
out, err = subprocess.Popen(cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
shell=True).communicate()

out, err = subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True
).communicate()
if err:
return None
else:
Expand All @@ -179,8 +202,9 @@ def run_shell_command(cmd):

def get_dso_path(core_so, dso_name):
if core_so and dso_name:
return run_shell_command("ldd %s|grep %s|awk '{print $3}'" %
(core_so, dso_name))
return run_shell_command(
"ldd %s|grep %s|awk '{print $3}'" % (core_so, dso_name)
)
else:
return None

Expand All @@ -189,6 +213,7 @@ def load_dso(dso_absolute_path):
if dso_absolute_path:
try:
from ctypes import cdll

cdll.LoadLibrary(dso_absolute_path)
except:
warnings.warn("Load {} failed".format(dso_absolute_path))
Expand Down Expand Up @@ -247,12 +272,14 @@ def to_list(s):

try:
from . import libpaddle

if avx_supported() and not libpaddle.is_compiled_with_avx():
sys.stderr.write(
"Hint: Your machine support AVX, but the installed paddlepaddle doesn't have avx core. "
"Hence, no-avx core with worse preformance will be imported.\nIf you like, you could "
"reinstall paddlepaddle by 'python -m pip install --force-reinstall paddlepaddle-gpu[==version]' "
"to get better performance.\n")
"to get better performance.\n"
)

# assign tensor alias
libpaddle.LoDTensor = libpaddle.Tensor
Expand Down Expand Up @@ -283,6 +310,7 @@ def to_list(s):
from .libpaddle import _Profiler, _ProfilerResult, _RecordEvent
from .libpaddle import _set_current_stream
from .libpaddle import _get_phi_kernel_name

if sys.platform != 'win32':
from .libpaddle import _set_process_pids
from .libpaddle import _erase_process_pids
Expand All @@ -295,12 +323,18 @@ def to_list(s):
except Exception as e:
if has_paddle_dy_lib:
sys.stderr.write(
'Error: Can not import paddle core while this file exists: ' +
current_path + os.sep + 'libpaddle.' + dy_lib_suffix + '\n')
'Error: Can not import paddle core while this file exists: '
+ current_path
+ os.sep
+ 'libpaddle.'
+ dy_lib_suffix
+ '\n'
)
if not avx_supported() and libpaddle.is_compiled_with_avx():
sys.stderr.write(
"Error: Your machine doesn't support AVX, but the installed PaddlePaddle is avx core, "
"you should reinstall paddlepaddle with no-avx core.\n")
"you should reinstall paddlepaddle with no-avx core.\n"
)
raise e


Expand All @@ -317,22 +351,26 @@ def set_paddle_custom_device_lib_path(lib_path):

# set paddle lib path
def set_paddle_lib_path():
site_dirs = site.getsitepackages() if hasattr(
site,
'getsitepackages') else [x for x in sys.path if 'site-packages' in x]
site_dirs = (
site.getsitepackages()
if hasattr(site, 'getsitepackages')
else [x for x in sys.path if 'site-packages' in x]
)
for site_dir in site_dirs:
lib_dir = os.path.sep.join([site_dir, 'paddle', 'libs'])
if os.path.exists(lib_dir):
_set_paddle_lib_path(lib_dir)
set_paddle_custom_device_lib_path(
os.path.sep.join([lib_dir, '..', '..', 'paddle-plugins']))
os.path.sep.join([lib_dir, '..', '..', 'paddle-plugins'])
)
return
if hasattr(site, 'USER_SITE'):
lib_dir = os.path.sep.join([site.USER_SITE, 'paddle', 'libs'])
if os.path.exists(lib_dir):
_set_paddle_lib_path(lib_dir)
set_paddle_custom_device_lib_path(
os.path.sep.join([lib_dir, '..', '..', 'paddle-plugins']))
os.path.sep.join([lib_dir, '..', '..', 'paddle-plugins'])
)


set_paddle_lib_path()
Loading