[1/N] Remove inclusion of ATen/core/Array.h (pytorch#122064)
The functionality of Array.h largely overlaps with std::array, so it should be safe to use std::array instead.

Pull Request resolved: pytorch#122064
Approved by: https://github.com/ezyang
cyyever authored and pytorchmergebot committed Nov 18, 2024
1 parent 6c6f745 commit 06dde8c
Showing 5 changed files with 22 additions and 20 deletions.
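Before the per-file diffs, a minimal sketch of the claimed overlap. SimpleArray is a hypothetical, reduced stand-in for at::detail::Array, keeping only what these diffs rely on (a public raw data member, operator[], and a fill constructor); the real header also carries C10_HOST_DEVICE annotations for CUDA.

#include <array>
#include <cassert>
#include <cstdint>

// Hypothetical, simplified stand-in for at::detail::Array<T, size_>;
// not the real ATen header.
template <typename T, int size_>
struct SimpleArray {
  T data[size_];                            // public raw array member
  T& operator[](int i) { return data[i]; }  // same indexing as std::array
  SimpleArray() = default;
  SimpleArray(T x) {                        // fill constructor, as in UINT4(0)
    for (int i = 0; i < size_; i++) {
      data[i] = x;
    }
  }
};

int main() {
  SimpleArray<uint32_t, 4> old_style(0);  // old call sites: explicit zero fill
  std::array<uint32_t, 4> new_style{};    // new call sites: value-init, also all zeros
  for (int i = 0; i < 4; i++) {
    assert(old_style[i] == new_style[i]); // element access is identical
  }
  return 0;
}

The call sites below change in exactly these two ways: Array<T, N>(x) fills become std::array value-initialization, and indexing stays as-is; the one remaining spelling change (&a.data[1] versus &a[1]) is illustrated after the CUDALoops.cuh diff.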
aten/src/ATen/core/PhiloxRNGEngine.h (11 changes: 6 additions & 5 deletions)
@@ -11,6 +11,7 @@
 #include <cuda.h>
 #endif
 
+#include <array>
 #include <ATen/core/Array.h>
 #include <c10/macros/Macros.h>
 #include <cmath>
@@ -21,10 +22,10 @@ namespace at {
 // typedefs for holding vector data
 namespace detail {
 
-typedef at::detail::Array<uint32_t, 4> UINT4;
-typedef at::detail::Array<uint32_t, 2> UINT2;
-typedef at::detail::Array<double, 2> DOUBLE2;
-typedef at::detail::Array<float, 2> FLOAT2;
+typedef std::array<uint32_t, 4> UINT4;
+typedef std::array<uint32_t, 2> UINT2;
+typedef std::array<double, 2> DOUBLE2;
+typedef std::array<float, 2> FLOAT2;
 
 } // namespace detail
 
@@ -79,7 +80,7 @@ class philox_engine {
                                           uint64_t subsequence = 0) {
     key_[0] = static_cast<uint32_t>(seed);
     key_[1] = static_cast<uint32_t>(seed >> 32);
-    counter_ = detail::UINT4(0);
+    counter_ = detail::UINT4{};
     counter_[2] = static_cast<uint32_t>(subsequence);
     counter_[3] = static_cast<uint32_t>(subsequence >> 32);
     STATE = 0;
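One behavioral note on the counter_ change above: detail::UINT4(0) zeroed all four lanes via Array's fill constructor, and detail::UINT4{} (now a std::array) value-initializes, which also zeroes every element, so reset_state behaves the same. A quick standalone check, with UINT4 redeclared locally for illustration:

#include <array>
#include <cassert>
#include <cstdint>

typedef std::array<uint32_t, 4> UINT4;  // matches the new typedef above

int main() {
  UINT4 counter_ = UINT4{};  // value-initialization: all lanes zero
  counter_[2] = 42;          // then set the subsequence words, as reset_state does
  assert(counter_[0] == 0 && counter_[1] == 0 && counter_[2] == 42 && counter_[3] == 0);
  return 0;
}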
aten/src/ATen/native/cuda/CUDALoops.cuh (16 changes: 8 additions & 8 deletions)
@@ -296,7 +296,7 @@ void gpu_kernel_impl_nocast(TensorIteratorBase& iter, const func_t& f) {
   TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);
   TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));
 
-  at::detail::Array<char*, ntensors> data;
+  std::array<char*, ntensors> data;
   for (int i = 0; i < ntensors; i++) {
     data[i] = (char*)iter.data_ptr(i);
   }
@@ -313,7 +313,7 @@ void gpu_kernel_impl_nocast(TensorIteratorBase& iter, const func_t& f) {
   launch_legacy_kernel<128, unroll_factor>(numel, [=] GPU_LAMBDA(int idx) {
     auto offsets = offset_calc.get(idx);
     arg0_t* out = (arg0_t*)(data[0] + offsets[0]);
-    *out = invoke(f, &data.data[1], &offsets.data[1], 1);
+    *out = invoke(f, &data[1], &offsets[1], 1);
   });
 }
 
@@ -330,7 +330,7 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
   TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);
   TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);
 
-  at::detail::Array<char*, ntensors> data;
+  std::array<char*, ntensors> data;
   for (int i = 0; i < ntensors; i++) {
     data[i] = (char*)iter.data_ptr(i);
   }
@@ -341,16 +341,16 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
 
   if (contiguous) {
 #ifdef USE_ROCM
-    at::detail::Array<ScalarType, ntensors> dtypes;
+    std::array<ScalarType, ntensors> dtypes;
     auto inner_strides = iter.get_inner_strides();
-    at::detail::Array<int, ntensors> strides;
+    std::array<int, ntensors> strides;
     for (int i = 0; i < ntensors; i++) {
       dtypes[i] = iter.dtype(i);
       strides[i] = inner_strides[i];
     }
     launch_legacy_kernel<512, 1>(numel, [=]GPU_LAMBDA(int idx) {
       void* out = data[0] + strides[0] * idx;
-      arg0_t result = invoke(f, &data.data[1], &strides.data[1], &dtypes.data[1], idx);
+      arg0_t result = invoke(f, &data[1], &strides[1], &dtypes[1], idx);
       c10::cast_and_store<arg0_t>(dtypes[0], out, result);
     });
 #else
@@ -368,15 +368,15 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
         storer);
 #endif
   } else {
-    at::detail::Array<ScalarType, ntensors> dtypes;
+    std::array<ScalarType, ntensors> dtypes;
     for (int i = 0; i < ntensors; i++) {
       dtypes[i] = iter.dtype(i);
     }
     auto offset_calc = ::make_offset_calculator<traits::arity + 1>(iter);
     launch_legacy_kernel<128, 4>(numel, [=] GPU_LAMBDA(int idx) {
       auto offsets = offset_calc.get(idx);
       void* out = data[0] + offsets[0];
-      arg0_t result = invoke(f, &data.data[1], &offsets.data[1], &dtypes.data[1], 1);
+      arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1);
       c10::cast_and_store<arg0_t>(dtypes[0], out, result);
     });
   }
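In this file the interesting edits are the invoke(...) call sites above: at::detail::Array exposes its storage as a public member array named data, so &data.data[1] took the address of element 1; std::array's data is a member function instead, and the same address is now spelled &data[1], since operator[] returns a reference. A standalone check using only the standard library:

#include <array>
#include <cassert>

int main() {
  std::array<char*, 3> data{};
  // operator[] returns a reference, so &data[1] is a char** pointing at
  // element 1, which the old code spelled &data.data[1].
  char** tail = &data[1];
  assert(tail == data.data() + 1);  // data() is a member function here
  return 0;
}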
aten/src/ATen/native/cuda/IndexKernel.cu (7 changes: 4 additions & 3 deletions)
@@ -2,6 +2,7 @@
 #include <ATen/native/cuda/IndexKernel.h>
 #include <ATen/native/IndexKernel.h>
 
+#include <array>
 #include <type_traits>
 #include <ATen/core/TensorBase.h>
 #include <ATen/Dispatch.h>
@@ -68,9 +69,9 @@ void gpu_index_kernel(TensorIteratorBase& iter, const IntArrayRef index_size, co
     return;
   }
 
-  auto sizes = at::detail::Array<int64_t, MAX_DIMS>(0);
-  auto strides = at::detail::Array<int64_t, MAX_DIMS>(0);
-  auto index_ptrs = at::detail::Array<char*, MAX_DIMS>(nullptr);
+  auto sizes = std::array<int64_t, MAX_DIMS>{};
+  auto strides = std::array<int64_t, MAX_DIMS>{};
+  auto index_ptrs = std::array<char*, MAX_DIMS>{};
   for (unsigned i = 0; i < num_indices; i++) {
     sizes[i] = index_size[i];
     strides[i] = index_stride[i];
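One equivalence worth noting in the hunk above: at::detail::Array<char*, MAX_DIMS>(nullptr) filled the array with null pointers, while std::array<char*, MAX_DIMS>{} value-initializes its elements, which also sets every pointer to nullptr (and, for sizes and strides, every int64_t to zero). A quick check, with MAX_DIMS replaced by a hypothetical local constant:

#include <array>
#include <cassert>
#include <cstddef>

int main() {
  constexpr std::size_t kMaxDims = 25;       // stand-in for MAX_DIMS
  std::array<char*, kMaxDims> index_ptrs{};  // value-init: all nullptr
  for (char* p : index_ptrs) {
    assert(p == nullptr);
  }
  return 0;
}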
aten/src/ATen/native/cuda/SpectralOps.cu (4 changes: 2 additions & 2 deletions)
@@ -7,8 +7,8 @@
 #include <ATen/detail/CUDAHooksInterface.h>
 #include <ATen/native/SpectralOpsUtils.h>
 
+#include <array>
 #include <cmath>
-#include <vector>
 
 
 namespace at::native {
@@ -17,7 +17,7 @@ namespace at::native {
 // In mirrored dims, maps linear index i to (n - i) % n
 template <typename index_t>
 struct HermitianSymmetryOffsetCalculator {
-  using offset_type = at::detail::Array<index_t, 1>;
+  using offset_type = std::array<index_t, 1>;
   using dim_type = std::remove_cv_t<decltype(MAX_DIMS)>;
   dim_type dims;
   at::cuda::detail::IntDivider<index_t> sizes_[MAX_DIMS];
aten/src/ATen/test/cuda_vectorized_test.cu (4 changes: 2 additions & 2 deletions)
@@ -1,9 +1,9 @@
+#include <array>
 #include <gtest/gtest.h>
 #include <ATen/ATen.h>
 #include <ATen/native/cuda/Loops.cuh>
 #include <ATen/native/cuda/MemoryAccess.cuh>
 #include <ATen/cuda/CUDAContext.h>
-#include <ATen/core/Array.h>
 
 using namespace at::native;
 using namespace at::native::memory;
@@ -77,7 +77,7 @@ TEST(TestVectorizedMemoryAccess, CanVectorizeUpTo) {
 template <typename scalar_t, int vec_size>
 __global__ void vectorized_copy(scalar_t *dst, scalar_t *src) {
   static_assert(vec_size <= thread_work_size() && thread_work_size() % vec_size == 0, "Invalid vec_size");
-  using array_t = at::detail::Array<char*, 2>;
+  using array_t = std::array<char*, 2>;
   array_t data;
   data[0] = reinterpret_cast<char *>(dst);
   data[1] = reinterpret_cast<char *>(src);
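A closing note on the kernel above (my reading, not stated in the commit): std::array carries no CUDA host/device annotations, but its accessors are constexpr, and nvcc can call constexpr host functions from device code when compiled with --expt-relaxed-constexpr, which to my knowledge PyTorch's CUDA build enables; at::detail::Array existed largely to provide C10_HOST_DEVICE-annotated equivalents. A minimal sketch under that assumption:

#include <array>
#include <cuda_runtime.h>

// Minimal analogue of the vectorized_copy kernel above; assumes nvcc with
// --expt-relaxed-constexpr so std::array's constexpr accessors are usable
// in device code.
__global__ void copy_one_byte(char* dst, char* src) {
  std::array<char*, 2> data{};
  data[0] = dst;
  data[1] = src;
  *data[0] = *data[1];
}

int main() {
  char *dst = nullptr, *src = nullptr;
  cudaMalloc(reinterpret_cast<void**>(&dst), 1);
  cudaMalloc(reinterpret_cast<void**>(&src), 1);
  copy_one_byte<<<1, 1>>>(dst, src);
  cudaDeviceSynchronize();
  cudaFree(dst);
  cudaFree(src);
  return 0;
}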
