Fix PyTorch-1.12.1-foss-2022a (CUDA) on POWER
Flamefire committed Aug 8, 2023
1 parent fbc7a74 commit 69c9d48
Showing 3 changed files with 79 additions and 0 deletions.
New file: PyTorch-1.11.0_fix-fp16-quantization-without-fbgemm.patch
@@ -0,0 +1,25 @@
Fix a use-after-free leading to random failures in nn/test_embedding
on e.g. POWER platforms where FBGEMM isn't used.

From https://github.com/pytorch/pytorch/pull/84750

Author: Alexander Grund (TU Dresden)

diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp
index 224a66f8abf..f4d018007bf 100644
--- a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp
@@ -252,9 +252,10 @@ Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight) {
}

#else
- const auto weight_data = weight_contig->scalar_type() == at::ScalarType::Half
- ? weight_contig->to(at::ScalarType::Float).data_ptr<float>()
- : weight_contig->data_ptr<float>();
+ const Tensor& float_weight = weight_contig->scalar_type() == at::ScalarType::Half
+ ? weight_contig->to(at::ScalarType::Float)
+ : *weight_contig;
+ const auto weight_data = float_weight.data_ptr<float>();
constexpr float kEpsilon = 1e-8f;
for (auto row : c10::irange(embedding_rows)) {
const float* input_row = weight_data + row * embedding_cols;
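
For context, the problem fixed above is a temporary-lifetime bug: weight_contig->to(at::ScalarType::Float) produces a temporary Tensor, and taking data_ptr<float>() from it leaves weight_data pointing into storage that is released as soon as that full expression ends. The patch binds the (possibly converted) tensor to a named const Tensor& so it stays alive for the whole loop. A minimal, self-contained sketch of the same pattern, not ATen code: std::vector<float> stands in for the converted tensor and to_float is an illustrative helper.

#include <cstdio>
#include <vector>

// Stand-in for weight_contig->to(at::ScalarType::Float): returns a new buffer.
std::vector<float> to_float(const std::vector<int>& half_like) {
  return std::vector<float>(half_like.begin(), half_like.end());
}

int main() {
  std::vector<int> half_weight = {1, 2, 3, 4};

  // Buggy pattern (pre-patch): the buffer returned by to_float() is a
  // temporary that is destroyed at the end of this full expression, so the
  // pointer dangles immediately:
  //   const float* weight_data = to_float(half_weight).data();  // use-after-free
  //
  // Fixed pattern (what the patch does with "const Tensor& float_weight"):
  // bind the converted buffer to a named const reference so it stays alive
  // for as long as the raw pointer is used.
  const std::vector<float>& float_weight = to_float(half_weight);
  const float* weight_data = float_weight.data();

  for (std::size_t i = 0; i < float_weight.size(); ++i)
    std::printf("%.1f ", weight_data[i]);
  std::printf("\n");
  return 0;
}

With FBGEMM enabled this fallback path is never taken, which is why the failure only shows up on platforms such as POWER.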
New file: PyTorch-1.12.0_fix-EmbeddingBag-without-fbgemm.patch
@@ -0,0 +1,48 @@
There is a bug in the fallback path used when FBGEMM isn't available (e.g. on POWER)
which leads to a race condition:
each thread converts ("copies") the full output buffer back to FP16, although the buffer is processed in chunks by different threads.
This a) duplicates the work and b) might write incomplete/wrong data to the output.

Found in failing test_embedding_bag_half_cpu_* of nn/test_embedding:
ERROR: test_embedding_bag_half_cpu_int32_int32 (__main__.TestEmbeddingNNDeviceTypeCPU)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/dev/shm/s3248973-EasyBuild/PyTorch/1.13.1/foss-2022a/pytorch-v1.13.1/test/nn/test_embedding.py", line 936, in _test_EmbeddingBag_vs_Embedding
self.assertEqual(output, ref_output, atol=dtype2prec_DONTUSE[wdtype], rtol=0)
File "/tmp/eb-tmp-2022a/lib/python3.10/site-packages/torch/testing/_internal/common_utils.py", line 2470, in assertEqual
assert_equal(
File "/tmp/eb-tmp-2022a/lib/python3.10/site-packages/torch/testing/_comparison.py", line 1093, in assert_equal
raise error_metas[0].to_error(msg)
AssertionError: Tensor-likes are not close!

Mismatched elements: 1 / 4 (25.0%)
Greatest absolute difference: 1.18359375 at index (1, 1) (up to 0.01 allowed)
Greatest relative difference: 1.0 at index (1, 1) (up to 0 allowed)


Introduced by https://github.com/pytorch/pytorch/pull/74844

Author: Alexander Grund (TU Dresden)

diff --git a/aten/src/ATen/native/EmbeddingBag.cpp b/aten/src/ATen/native/EmbeddingBag.cpp
index 6d8cea26f52..604ea16bace 100644
--- a/aten/src/ATen/native/EmbeddingBag.cpp
+++ b/aten/src/ATen/native/EmbeddingBag.cpp
@@ -246,7 +246,7 @@ index_select_add(const Tensor &select_indices,
/*scale_bias=*/nullptr,
/*normalize_by_lengths=*/false,
/*out=*/output_data_fp32 + start_idx * ddim);
- for (const auto i : c10::irange(output_size)) {
+ for (const auto i : c10::irange(start_idx, end_idx)) {
// Convert FP32 intermediate buffer result back to FP16 for output dtype
for (const auto d : c10::irange(ddim)) {
(output_data + i * ddim)[d] = static_cast<at::Half>((output_data_fp32 + ddim * i)[d]);
@@ -590,7 +590,7 @@ index_select_scale_add(const Tensor &select_indices,
/*scale_bias=*/nullptr,
/*normalize_by_lengths=*/false,
/*out=*/output_data_fp32 + start_idx * ddim);
- for (const auto i : c10::irange(output_size)) {
+ for (const auto i : c10::irange(start_idx, end_idx)) {
// Convert FP32 intermediate buffer result back to FP16 for output dtype
for (const auto d : c10::irange(ddim)) {
(output_data + i * ddim)[d] = static_cast<at::Half>((output_data_fp32 + ddim * i)[d]);
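
The fix above narrows each thread's FP16 conversion loop from all output_size rows to only the [start_idx, end_idx) chunk it actually computed into the FP32 intermediate buffer; before the patch every thread converted every row, reading rows other threads might not have produced yet and writing the same output rows concurrently. A minimal, self-contained sketch of the corrected pattern, not ATen code: int16_t stands in for at::Half and the two explicit std::thread workers are illustrative only.

#include <cstdint>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
  const int64_t output_size = 8;   // number of output rows
  const int64_t ddim = 4;          // embedding dimension
  std::vector<float> output_fp32(output_size * ddim);
  std::vector<int16_t> output(output_size * ddim);

  auto worker = [&](int64_t start_idx, int64_t end_idx) {
    // Each thread produces the FP32 intermediate results for its own rows...
    for (int64_t i = start_idx; i < end_idx; ++i)
      for (int64_t d = 0; d < ddim; ++d)
        output_fp32[i * ddim + d] = static_cast<float>(i) + 0.5f;

    // ...and converts back only the rows it owns. Iterating over all
    // output_size rows here (the pre-patch behaviour) would read rows that
    // other threads may not have written yet and race on the output buffer.
    for (int64_t i = start_idx; i < end_idx; ++i)
      for (int64_t d = 0; d < ddim; ++d)
        output[i * ddim + d] = static_cast<int16_t>(output_fp32[i * ddim + d]);
  };

  std::thread t1(worker, 0, output_size / 2);
  std::thread t2(worker, output_size / 2, output_size);
  t1.join();
  t2.join();

  for (int64_t i = 0; i < output_size; ++i)
    std::printf("row %lld -> %d\n", static_cast<long long>(i), output[i * ddim]);
  return 0;
}

The incomplete/duplicated conversion is exactly what the test failure above shows: a single mismatched element with a relative difference of 1.0, i.e. a row that was written before its FP32 data was ready.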
Modified file: the PyTorch-1.12.1-foss-2022a (CUDA) easyconfig
@@ -19,12 +19,14 @@ patches = [
'PyTorch-1.10.0_fix-test-model_dump.patch',
'PyTorch-1.10.0_fix-vsx-vector-functions.patch',
'PyTorch-1.10.0_skip-nnapi-test-without-qnnpack.patch',
+ 'PyTorch-1.11.0_fix-fp16-quantization-without-fbgemm.patch',
'PyTorch-1.11.0_fix-fsdp-fp16-test.patch',
'PyTorch-1.11.0_fix-test_utils.patch',
'PyTorch-1.11.0_increase_c10d_gloo_timeout.patch',
'PyTorch-1.11.0_increase-distributed-test-timeout.patch',
'PyTorch-1.11.0_install-vsx-vec-headers.patch',
'PyTorch-1.11.1_skip-test_init_from_local_shards.patch',
+ 'PyTorch-1.12.0_fix-EmbeddingBag-without-fbgemm.patch',
'PyTorch-1.12.1_add-hypothesis-suppression.patch',
'PyTorch-1.12.1_fix-autograd-thread_shutdown-test.patch',
'PyTorch-1.12.1_fix-cuda-gcc-version-check.patch',
@@ -60,6 +62,8 @@ checksums = [
'7bef5f96cb83b2d655d2f76dd7468a171d446f0b3e06da2232ec7f886484d312',
# PyTorch-1.10.0_skip-nnapi-test-without-qnnpack.patch
'34ba476a7bcddec323bf9eca083cb4623d0f569d081aa3add3769c24f22849d2',
+ {'PyTorch-1.11.0_fix-fp16-quantization-without-fbgemm.patch':
+  'cc526130b6446bbbf5f0f7372d3aeee3e7d4c4d6e471524dff028b430b152934'},
'bb1c4e6d6fd4b0cf57ff8b824c797331b533bb1ffc63f5db0bae3aee10c3dc13', # PyTorch-1.11.0_fix-fsdp-fp16-test.patch
'4f7e25c4e2eb7094f92607df74488c6a4a35849fabf05fcf6c3655fa3f44a861', # PyTorch-1.11.0_fix-test_utils.patch
# PyTorch-1.11.0_increase_c10d_gloo_timeout.patch
@@ -69,6 +73,8 @@
'f2e6b9625733d9a471bb75e1ea20e28814cf1380b4f9089aa838ee35ddecf07d', # PyTorch-1.11.0_install-vsx-vec-headers.patch
# PyTorch-1.11.1_skip-test_init_from_local_shards.patch
'4aeb1b0bc863d4801b0095cbce69f8794066748f0df27c6aaaf729c5ecba04b7',
+ {'PyTorch-1.12.0_fix-EmbeddingBag-without-fbgemm.patch':
+  '090598592283e3fc46ee08a68b6a6afe07be41b26514afba51834408bf1c98ed'},
# PyTorch-1.12.1_add-hypothesis-suppression.patch
'e71ffb94ebe69f580fa70e0de84017058325fdff944866d6bd03463626edc32c',
# PyTorch-1.12.1_fix-autograd-thread_shutdown-test.patch
