[1/N] Remove inclusion of ATen/core/Array.h (pytorch#122064)
The functionality of Array.h largely overlaps with std::array, so it should be safe to use std::array instead.

Pull Request resolved: pytorch#122064
Approved by: https://github.com/ezyang
cyyever authored and pytorchmergebot committed Nov 18, 2024
1 parent 6c6f745 commit 06dde8c
Showing 5 changed files with 22 additions and 20 deletions.
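Before the per-file diffs, a minimal sketch of the claimed overlap. SimpleArray is a hypothetical, reduced stand-in for at::detail::Array, keeping only what these diffs rely on (a public raw data member, operator[], and a fill constructor); the real header also carries C10_HOST_DEVICE annotations for CUDA.

#include <array>
#include <cassert>
#include <cstdint>

// Hypothetical, simplified stand-in for at::detail::Array<T, size_>;
// not the real ATen header.
template <typename T, int size_>
struct SimpleArray {
  T data[size_];                            // public raw array member
  T& operator[](int i) { return data[i]; }  // same indexing as std::array
  SimpleArray() = default;
  SimpleArray(T x) {                        // fill constructor, as in UINT4(0)
    for (int i = 0; i < size_; i++) {
      data[i] = x;
    }
  }
};

int main() {
  SimpleArray<uint32_t, 4> old_style(0);  // old call sites: explicit zero fill
  std::array<uint32_t, 4> new_style{};    // new call sites: value-init, also all zeros
  for (int i = 0; i < 4; i++) {
    assert(old_style[i] == new_style[i]); // element access is identical
  }
  return 0;
}

The call sites below change in exactly these two ways: Array<T, N>(x) fills become std::array value-initialization, and indexing stays as-is; the one remaining spelling change (&a.data[1] versus &a[1]) is illustrated after the CUDALoops.cuh diff.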
aten/src/ATen/core/PhiloxRNGEngine.h (11 changes: 6 additions & 5 deletions)
@@ -11,6 +11,7 @@
 #include <cuda.h>
 #endif
 
+#include <array>
 #include <ATen/core/Array.h>
 #include <c10/macros/Macros.h>
 #include <cmath>
@@ -21,10 +22,10 @@ namespace at {
 // typedefs for holding vector data
 namespace detail {
 
-typedef at::detail::Array<uint32_t, 4> UINT4;
-typedef at::detail::Array<uint32_t, 2> UINT2;
-typedef at::detail::Array<double, 2> DOUBLE2;
-typedef at::detail::Array<float, 2> FLOAT2;
+typedef std::array<uint32_t, 4> UINT4;
+typedef std::array<uint32_t, 2> UINT2;
+typedef std::array<double, 2> DOUBLE2;
+typedef std::array<float, 2> FLOAT2;
 
 } // namespace detail
 
@@ -79,7 +80,7 @@ class philox_engine {
                                           uint64_t subsequence = 0) {
     key_[0] = static_cast<uint32_t>(seed);
     key_[1] = static_cast<uint32_t>(seed >> 32);
-    counter_ = detail::UINT4(0);
+    counter_ = detail::UINT4{};
     counter_[2] = static_cast<uint32_t>(subsequence);
     counter_[3] = static_cast<uint32_t>(subsequence >> 32);
     STATE = 0;
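One behavioral note on the counter_ change above: detail::UINT4(0) zeroed all four lanes via Array's fill constructor, and detail::UINT4{} (now a std::array) value-initializes, which also zeroes every element, so reset_state behaves the same. A quick standalone check, with UINT4 redeclared locally for illustration:

#include <array>
#include <cassert>
#include <cstdint>

typedef std::array<uint32_t, 4> UINT4;  // matches the new typedef above

int main() {
  UINT4 counter_ = UINT4{};  // value-initialization: all lanes zero
  counter_[2] = 42;          // then set the subsequence words, as reset_state does
  assert(counter_[0] == 0 && counter_[1] == 0 && counter_[2] == 42 && counter_[3] == 0);
  return 0;
}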
aten/src/ATen/native/cuda/CUDALoops.cuh (16 changes: 8 additions & 8 deletions)
@@ -296,7 +296,7 @@ void gpu_kernel_impl_nocast(TensorIteratorBase& iter, const func_t& f) {
   TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);
   TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));
 
-  at::detail::Array<char*, ntensors> data;
+  std::array<char*, ntensors> data;
   for (int i = 0; i < ntensors; i++) {
     data[i] = (char*)iter.data_ptr(i);
   }
@@ -313,7 +313,7 @@ void gpu_kernel_impl_nocast(TensorIteratorBase& iter, const func_t& f) {
   launch_legacy_kernel<128, unroll_factor>(numel, [=] GPU_LAMBDA(int idx) {
     auto offsets = offset_calc.get(idx);
     arg0_t* out = (arg0_t*)(data[0] + offsets[0]);
-    *out = invoke(f, &data.data[1], &offsets.data[1], 1);
+    *out = invoke(f, &data[1], &offsets[1], 1);
   });
 }
 
@@ -330,7 +330,7 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
   TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);
   TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);
 
-  at::detail::Array<char*, ntensors> data;
+  std::array<char*, ntensors> data;
   for (int i = 0; i < ntensors; i++) {
     data[i] = (char*)iter.data_ptr(i);
   }
@@ -341,16 +341,16 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
 
   if (contiguous) {
 #ifdef USE_ROCM
-    at::detail::Array<ScalarType, ntensors> dtypes;
+    std::array<ScalarType, ntensors> dtypes;
     auto inner_strides = iter.get_inner_strides();
-    at::detail::Array<int, ntensors> strides;
+    std::array<int, ntensors> strides;
     for (int i = 0; i < ntensors; i++) {
       dtypes[i] = iter.dtype(i);
       strides[i] = inner_strides[i];
     }
     launch_legacy_kernel<512, 1>(numel, [=]GPU_LAMBDA(int idx) {
       void* out = data[0] + strides[0] * idx;
-      arg0_t result = invoke(f, &data.data[1], &strides.data[1], &dtypes.data[1], idx);
+      arg0_t result = invoke(f, &data[1], &strides[1], &dtypes[1], idx);
       c10::cast_and_store<arg0_t>(dtypes[0], out, result);
     });
 #else
@@ -368,15 +368,15 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
         storer);
 #endif
   } else {
-    at::detail::Array<ScalarType, ntensors> dtypes;
+    std::array<ScalarType, ntensors> dtypes;
     for (int i = 0; i < ntensors; i++) {
       dtypes[i] = iter.dtype(i);
     }
     auto offset_calc = ::make_offset_calculator<traits::arity + 1>(iter);
     launch_legacy_kernel<128, 4>(numel, [=] GPU_LAMBDA(int idx) {
       auto offsets = offset_calc.get(idx);
       void* out = data[0] + offsets[0];
-      arg0_t result = invoke(f, &data.data[1], &offsets.data[1], &dtypes.data[1], 1);
+      arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1);
       c10::cast_and_store<arg0_t>(dtypes[0], out, result);
     });
   }
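In this file the interesting edits are the invoke(...) call sites above: at::detail::Array exposes its storage as a public member array named data, so &data.data[1] took the address of element 1; std::array's data is a member function instead, and the same address is now spelled &data[1], since operator[] returns a reference. A standalone check using only the standard library:

#include <array>
#include <cassert>

int main() {
  std::array<char*, 3> data{};
  // operator[] returns a reference, so &data[1] is a char** pointing at
  // element 1, which the old code spelled &data.data[1].
  char** tail = &data[1];
  assert(tail == data.data() + 1);  // data() is a member function here
  return 0;
}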
aten/src/ATen/native/cuda/IndexKernel.cu (7 changes: 4 additions & 3 deletions)
@@ -2,6 +2,7 @@
 #include <ATen/native/cuda/IndexKernel.h>
 #include <ATen/native/IndexKernel.h>
 
+#include <array>
 #include <type_traits>
 #include <ATen/core/TensorBase.h>
 #include <ATen/Dispatch.h>
@@ -68,9 +69,9 @@ void gpu_index_kernel(TensorIteratorBase& iter, const IntArrayRef index_size, co
     return;
   }
 
-  auto sizes = at::detail::Array<int64_t, MAX_DIMS>(0);
-  auto strides = at::detail::Array<int64_t, MAX_DIMS>(0);
-  auto index_ptrs = at::detail::Array<char*, MAX_DIMS>(nullptr);
+  auto sizes = std::array<int64_t, MAX_DIMS>{};
+  auto strides = std::array<int64_t, MAX_DIMS>{};
+  auto index_ptrs = std::array<char*, MAX_DIMS>{};
   for (unsigned i = 0; i < num_indices; i++) {
     sizes[i] = index_size[i];
     strides[i] = index_stride[i];
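One equivalence worth noting in the hunk above: at::detail::Array<char*, MAX_DIMS>(nullptr) filled the array with null pointers, while std::array<char*, MAX_DIMS>{} value-initializes its elements, which also sets every pointer to nullptr (and, for sizes and strides, every int64_t to zero). A quick check, with MAX_DIMS replaced by a hypothetical local constant:

#include <array>
#include <cassert>
#include <cstddef>

int main() {
  constexpr std::size_t kMaxDims = 25;       // stand-in for MAX_DIMS
  std::array<char*, kMaxDims> index_ptrs{};  // value-init: all nullptr
  for (char* p : index_ptrs) {
    assert(p == nullptr);
  }
  return 0;
}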
aten/src/ATen/native/cuda/SpectralOps.cu (4 changes: 2 additions & 2 deletions)
@@ -7,8 +7,8 @@
 #include <ATen/detail/CUDAHooksInterface.h>
 #include <ATen/native/SpectralOpsUtils.h>
 
+#include <array>
 #include <cmath>
-#include <vector>
 
 
 namespace at::native {
@@ -17,7 +17,7 @@ namespace at::native {
 // In mirrored dims, maps linear index i to (n - i) % n
 template <typename index_t>
 struct HermitianSymmetryOffsetCalculator {
-  using offset_type = at::detail::Array<index_t, 1>;
+  using offset_type = std::array<index_t, 1>;
   using dim_type = std::remove_cv_t<decltype(MAX_DIMS)>;
   dim_type dims;
   at::cuda::detail::IntDivider<index_t> sizes_[MAX_DIMS];
aten/src/ATen/test/cuda_vectorized_test.cu (4 changes: 2 additions & 2 deletions)
@@ -1,9 +1,9 @@
+#include <array>
 #include <gtest/gtest.h>
 #include <ATen/ATen.h>
 #include <ATen/native/cuda/Loops.cuh>
 #include <ATen/native/cuda/MemoryAccess.cuh>
 #include <ATen/cuda/CUDAContext.h>
-#include <ATen/core/Array.h>
 
 using namespace at::native;
 using namespace at::native::memory;
@@ -77,7 +77,7 @@ TEST(TestVectorizedMemoryAccess, CanVectorizeUpTo) {
 template <typename scalar_t, int vec_size>
 __global__ void vectorized_copy(scalar_t *dst, scalar_t *src) {
   static_assert(vec_size <= thread_work_size() && thread_work_size() % vec_size == 0, "Invalid vec_size");
-  using array_t = at::detail::Array<char*, 2>;
+  using array_t = std::array<char*, 2>;
   array_t data;
   data[0] = reinterpret_cast<char *>(dst);
   data[1] = reinterpret_cast<char *>(src);
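A closing note on the kernel above (my reading, not stated in the commit): std::array carries no CUDA host/device annotations, but its accessors are constexpr, and nvcc can call constexpr host functions from device code when compiled with --expt-relaxed-constexpr, which to my knowledge PyTorch's CUDA build enables; at::detail::Array existed largely to provide C10_HOST_DEVICE-annotated equivalents. A minimal sketch under that assumption:

#include <array>
#include <cuda_runtime.h>

// Minimal analogue of the vectorized_copy kernel above; assumes nvcc with
// --expt-relaxed-constexpr so std::array's constexpr accessors are usable
// in device code.
__global__ void copy_one_byte(char* dst, char* src) {
  std::array<char*, 2> data{};
  data[0] = dst;
  data[1] = src;
  *data[0] = *data[1];
}

int main() {
  char *dst = nullptr, *src = nullptr;
  cudaMalloc(reinterpret_cast<void**>(&dst), 1);
  cudaMalloc(reinterpret_cast<void**>(&src), 1);
  copy_one_byte<<<1, 1>>>(dst, src);
  cudaDeviceSynchronize();
  cudaFree(dst);
  cudaFree(src);
  return 0;
}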
