From af140b6ac424642fda06b01cc437cdda39606f4f Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 12 Nov 2020 21:34:27 +0000 Subject: [PATCH 01/11] Rename files. --- torchvision/csrc/{DeformConv.h => DeformConv2d.h} | 0 .../csrc/cpu/{DeformConv_cpu.cpp => DeformConv2d_cpu.cpp} | 0 .../csrc/cuda/{DeformConv_cuda.cu => DeformConv2d_cuda.cu} | 0 torchvision/csrc/vision.cpp | 2 +- 4 files changed, 1 insertion(+), 1 deletion(-) rename torchvision/csrc/{DeformConv.h => DeformConv2d.h} (100%) rename torchvision/csrc/cpu/{DeformConv_cpu.cpp => DeformConv2d_cpu.cpp} (100%) rename torchvision/csrc/cuda/{DeformConv_cuda.cu => DeformConv2d_cuda.cu} (100%) diff --git a/torchvision/csrc/DeformConv.h b/torchvision/csrc/DeformConv2d.h similarity index 100% rename from torchvision/csrc/DeformConv.h rename to torchvision/csrc/DeformConv2d.h diff --git a/torchvision/csrc/cpu/DeformConv_cpu.cpp b/torchvision/csrc/cpu/DeformConv2d_cpu.cpp similarity index 100% rename from torchvision/csrc/cpu/DeformConv_cpu.cpp rename to torchvision/csrc/cpu/DeformConv2d_cpu.cpp diff --git a/torchvision/csrc/cuda/DeformConv_cuda.cu b/torchvision/csrc/cuda/DeformConv2d_cuda.cu similarity index 100% rename from torchvision/csrc/cuda/DeformConv_cuda.cu rename to torchvision/csrc/cuda/DeformConv2d_cuda.cu diff --git a/torchvision/csrc/vision.cpp b/torchvision/csrc/vision.cpp index abfd78c5461..f43e18db461 100644 --- a/torchvision/csrc/vision.cpp +++ b/torchvision/csrc/vision.cpp @@ -8,7 +8,7 @@ #include #endif -#include "DeformConv.h" +#include "DeformConv2d.h" #include "PSROIAlign.h" #include "PSROIPool.h" #include "ROIAlign.h" From 5946432d4f63a0b8154f741ec268d10b77033162 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 12 Nov 2020 21:57:39 +0000 Subject: [PATCH 02/11] Standardizing method names. 
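Only file-local static helpers are renamed here (e.g. deform_conv2d_backward_input_cpu
becomes gradient_inputs, deform_conv2d_backward_parameters_* becomes
backward_gradient_parameters, and the *_gpu_kernel suffixes are dropped); the operator
entry points registered in vision.cpp keep their names. As a minimal standalone sketch
of the convention (illustrative only, not code from this diff): since "static" already
gives a helper internal linkage, operator/device qualifiers in its name carry no
information.

    // sketch.cpp -- build with: g++ -std=c++14 sketch.cpp
    #include <tuple>

    // Before: static std::tuple<int, int> deform_conv2d_backward_input_cpu(...)
    // After: the file-local helper drops the redundant prefix/suffix.
    static std::tuple<int, int> gradient_inputs(int a, int b) {
      return std::make_tuple(a + b, a - b);
    }

    int main() {
      return std::get<0>(gradient_inputs(1, 2)) == 3 ? 0 : 1;
    }
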
--- torchvision/csrc/cpu/DeformConv2d_cpu.cpp | 9 ++++----- torchvision/csrc/cuda/DeformConv2d_cuda.cu | 20 ++++++++++---------- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/torchvision/csrc/cpu/DeformConv2d_cpu.cpp b/torchvision/csrc/cpu/DeformConv2d_cpu.cpp index 0212be55aa4..c42efa61a0f 100644 --- a/torchvision/csrc/cpu/DeformConv2d_cpu.cpp +++ b/torchvision/csrc/cpu/DeformConv2d_cpu.cpp @@ -790,8 +790,7 @@ static void compute_grad_offset_and_mask( })); } -static std::tuple -deform_conv2d_backward_input_cpu( +static std::tuple gradient_inputs( at::Tensor input, at::Tensor weight, at::Tensor offset, @@ -944,7 +943,7 @@ deform_conv2d_backward_input_cpu( return std::make_tuple(grad_input, grad_offset, grad_mask); } -static at::Tensor deform_conv2d_backward_parameters_cpu( +static at::Tensor backward_gradient_parameters( at::Tensor input, const at::Tensor& weight, at::Tensor offset, @@ -1086,7 +1085,7 @@ DeformConv2d_backward_cpu( const int n_parallel_imgs = get_greatest_divisor_below_bound(batch_sz, kMaxParallelImgs); - auto grad_input_and_offset_and_mask = deform_conv2d_backward_input_cpu( + auto grad_input_and_offset_and_mask = gradient_inputs( input, weight, offset, @@ -1107,7 +1106,7 @@ DeformConv2d_backward_cpu( auto grad_offset = std::get<1>(grad_input_and_offset_and_mask); auto grad_mask = std::get<2>(grad_input_and_offset_and_mask); - auto grad_weight = deform_conv2d_backward_parameters_cpu( + auto grad_weight = backward_gradient_parameters( input, weight, offset, diff --git a/torchvision/csrc/cuda/DeformConv2d_cuda.cu b/torchvision/csrc/cuda/DeformConv2d_cuda.cu index c6e9a9278ed..ab4cf64dc43 100644 --- a/torchvision/csrc/cuda/DeformConv2d_cuda.cu +++ b/torchvision/csrc/cuda/DeformConv2d_cuda.cu @@ -133,7 +133,7 @@ __device__ scalar_t bilinear_interpolate( } template -__global__ void deformable_im2col_gpu_kernel( +__global__ void deformable_im2col_kernel( int n, const scalar_t* input_ptr, const scalar_t* offset_ptr, @@ -233,7 +233,7 @@ static void deformable_im2col( AT_DISPATCH_FLOATING_TYPES_AND_HALF( input.scalar_type(), "deformable_im2col_gpu", ([&] { - deformable_im2col_gpu_kernel<<< + deformable_im2col_kernel<<< blocks, threads>>>( num_kernels, @@ -484,7 +484,7 @@ at::Tensor DeformConv2d_forward_cuda( } template -__global__ void deformable_col2im_gpu_kernel( +__global__ void deformable_col2im_kernel( int n, const scalar_t* col, const scalar_t* offset_ptr, @@ -588,7 +588,7 @@ static void compute_grad_input( AT_DISPATCH_FLOATING_TYPES_AND_HALF( columns.scalar_type(), "deformable_col2im_gpu", ([&] { - deformable_col2im_gpu_kernel<<< + deformable_col2im_kernel<<< blocks, threads>>>( num_kernels, @@ -654,7 +654,7 @@ __device__ scalar_t get_coordinate_weight( } template -__global__ void deformable_col2im_coord_gpu_kernel( +__global__ void deformable_col2im_coord_kernel( int n, const scalar_t* col_ptr, const scalar_t* im_ptr, @@ -796,7 +796,7 @@ static void compute_grad_offset_and_mask( AT_DISPATCH_FLOATING_TYPES_AND_HALF( columns.scalar_type(), "deformable_col2im_coord_gpu", ([&] { - deformable_col2im_coord_gpu_kernel<<< + deformable_col2im_coord_kernel<<< blocks, threads>>>( num_kernels, @@ -832,7 +832,7 @@ static void compute_grad_offset_and_mask( } } -static std::tuple deform_conv2d_backward_input_cuda( +static std::tuple backward_gradient_inputs( at::Tensor input, at::Tensor weight, at::Tensor offset, @@ -986,7 +986,7 @@ static std::tuple deform_conv2d_backward_inp return std::make_tuple(grad_input, grad_offset, grad_mask); } -static at::Tensor 
deform_conv2d_backward_parameters_cuda( +static at::Tensor backward_gradient_parameters( at::Tensor input, const at::Tensor& weight, at::Tensor offset, @@ -1130,7 +1130,7 @@ DeformConv2d_backward_cuda( const int n_parallel_imgs = get_greatest_divisor_below_bound(batch_sz, kMaxParallelImgs); - auto grad_input_and_offset_and_mask = deform_conv2d_backward_input_cuda( + auto grad_input_and_offset_and_mask = backward_gradient_inputs( input, weight, offset, @@ -1151,7 +1151,7 @@ DeformConv2d_backward_cuda( auto grad_offset = std::get<1>(grad_input_and_offset_and_mask); auto grad_mask = std::get<2>(grad_input_and_offset_and_mask); - auto grad_weight = deform_conv2d_backward_parameters_cuda( + auto grad_weight = backward_gradient_parameters( input, weight, offset, From a47ff4adf84a58533f054aae65eedc8ba23d7077 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 12 Nov 2020 22:26:34 +0000 Subject: [PATCH 03/11] Adding anonymous namespaces. --- torchvision/csrc/DeformConv2d.h | 76 ++-- torchvision/csrc/cpu/DeformConv2d_cpu.cpp | 442 +++++++++++---------- torchvision/csrc/cuda/DeformConv2d_cuda.cu | 434 ++++++++++---------- 3 files changed, 481 insertions(+), 471 deletions(-) diff --git a/torchvision/csrc/DeformConv2d.h b/torchvision/csrc/DeformConv2d.h index f8a8dba60e6..1fe1b362b1c 100644 --- a/torchvision/csrc/DeformConv2d.h +++ b/torchvision/csrc/DeformConv2d.h @@ -11,7 +11,7 @@ #include "hip/vision_cuda.h" #endif -// TODO: put this stuff in torchvision namespace +namespace { at::Tensor deform_conv2d( const at::Tensor& input, @@ -48,42 +48,6 @@ at::Tensor deform_conv2d( use_mask); } -#if defined(WITH_CUDA) || defined(WITH_HIP) -at::Tensor DeformConv2d_autocast( - const at::Tensor& input, - const at::Tensor& weight, - const at::Tensor& offset, - const at::Tensor& mask, - const at::Tensor& bias, - int64_t stride_h, - int64_t stride_w, - int64_t pad_h, - int64_t pad_w, - int64_t dilation_h, - int64_t dilation_w, - int64_t groups, - int64_t offset_groups, - bool use_mask) { - c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::Autocast); - return deform_conv2d( - at::autocast::cached_cast(at::kFloat, input), - at::autocast::cached_cast(at::kFloat, weight), - at::autocast::cached_cast(at::kFloat, offset), - at::autocast::cached_cast(at::kFloat, mask), - at::autocast::cached_cast(at::kFloat, bias), - stride_h, - stride_w, - pad_h, - pad_w, - dilation_h, - dilation_w, - groups, - offset_groups, - use_mask) - .to(input.scalar_type()); -} -#endif - std::tuple _deform_conv2d_backward( const at::Tensor& grad, @@ -297,6 +261,44 @@ class DeformConv2dBackwardFunction } }; +} // namespace + +#if defined(WITH_CUDA) || defined(WITH_HIP) +at::Tensor DeformConv2d_autocast( + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& offset, + const at::Tensor& mask, + const at::Tensor& bias, + int64_t stride_h, + int64_t stride_w, + int64_t pad_h, + int64_t pad_w, + int64_t dilation_h, + int64_t dilation_w, + int64_t groups, + int64_t offset_groups, + bool use_mask) { + c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::Autocast); + return deform_conv2d( + at::autocast::cached_cast(at::kFloat, input), + at::autocast::cached_cast(at::kFloat, weight), + at::autocast::cached_cast(at::kFloat, offset), + at::autocast::cached_cast(at::kFloat, mask), + at::autocast::cached_cast(at::kFloat, bias), + stride_h, + stride_w, + pad_h, + pad_w, + dilation_h, + dilation_w, + groups, + offset_groups, + use_mask) + .to(input.scalar_type()); +} +#endif + at::Tensor 
DeformConv2d_autograd( const at::Tensor& input, const at::Tensor& weight, diff --git a/torchvision/csrc/cpu/DeformConv2d_cpu.cpp b/torchvision/csrc/cpu/DeformConv2d_cpu.cpp index c42efa61a0f..197f5176e95 100644 --- a/torchvision/csrc/cpu/DeformConv2d_cpu.cpp +++ b/torchvision/csrc/cpu/DeformConv2d_cpu.cpp @@ -74,10 +74,12 @@ #include #include +namespace { + const int kMaxParallelImgs = 32; template -static scalar_t bilinear_interpolate( +scalar_t bilinear_interpolate( const scalar_t* in, int height, int width, @@ -116,7 +118,7 @@ static scalar_t bilinear_interpolate( } template -static void deformable_im2col_kernel( +void deformable_im2col_kernel( int n, const scalar_t* input, const scalar_t* offset, @@ -190,7 +192,7 @@ static void deformable_im2col_kernel( } } -static void deformable_im2col( +void deformable_im2col( const at::Tensor& input, const at::Tensor& data_offset, const at::Tensor& data_mask, @@ -240,7 +242,7 @@ static void deformable_im2col( })); } -static int get_greatest_divisor_below_bound(int n, int bound) { +int get_greatest_divisor_below_bound(int n, int bound) { for (int k = bound; k > 1; --k) { if (n % k == 0) { return k; @@ -249,216 +251,8 @@ static int get_greatest_divisor_below_bound(int n, int bound) { return 1; } -at::Tensor DeformConv2d_forward_cpu( - const at::Tensor& input_param, - const at::Tensor& weight_param, - const at::Tensor& offset_param, - const at::Tensor& mask_param, - const at::Tensor& bias_param, - int64_t stride_h, - int64_t stride_w, - int64_t pad_h, - int64_t pad_w, - int64_t dil_h, - int64_t dil_w, - int64_t n_weight_grps, - int64_t n_offset_grps, - bool use_mask) { - at::Tensor input = input_param.contiguous(); - at::Tensor offset = offset_param.contiguous(); - at::Tensor weight = weight_param.contiguous(); - at::Tensor mask = mask_param.contiguous(); - at::Tensor bias = bias_param.contiguous(); - - TORCH_CHECK(input.ndimension() == 4); - TORCH_CHECK(offset.ndimension() == 4); - TORCH_CHECK(!use_mask || mask.ndimension() == 4); - TORCH_CHECK(weight.ndimension() == 4); - TORCH_CHECK(input.device().is_cpu(), "input must be a CPU tensor"); - - int batch_sz = input.size(0); - int n_in_channels = input.size(1); - int in_h = input.size(2); - int in_w = input.size(3); - - int n_parallel_imgs = - get_greatest_divisor_below_bound(batch_sz, kMaxParallelImgs); - - // Unpack shapes and args - int out_channels = weight.size(0); - int weight_h = weight.size(2); - int weight_w = weight.size(3); - - int ker_h = dil_h * (weight_h - 1) + 1; - int ker_w = dil_w * (weight_w - 1) + 1; - int out_h = ((in_h + 2 * pad_h - ker_h) / stride_h) + 1; - int out_w = ((in_w + 2 * pad_w - ker_w) / stride_w) + 1; - - TORCH_CHECK( - weight_h > 0 && weight_w > 0, - "weight_h: ", - weight_h, - " weight_w: ", - weight_w); - TORCH_CHECK( - stride_h > 0 && stride_w > 0, - "stride_h: ", - stride_h, - " stride_w: ", - stride_w); - TORCH_CHECK(pad_h >= 0 && pad_w >= 0, "pad_h: ", pad_h, " pad_w: ", pad_w); - TORCH_CHECK(dil_h > 0 && dil_w > 0, "dil_h: ", dil_h, " dil_w: ", dil_w); - - TORCH_CHECK(weight.size(1) * n_weight_grps == input.size(1)); - TORCH_CHECK(weight.size(0) % n_weight_grps == 0); - TORCH_CHECK( - (offset.size(1) == n_offset_grps * 2 * weight_h * weight_w), - "offset.shape[1] is not valid: got: ", - offset.size(1), - " expected: ", - n_offset_grps * 2 * weight_h * weight_w); - TORCH_CHECK( - (!use_mask || mask.size(1) == n_offset_grps * weight_h * weight_w), - "mask.shape[1] is not valid: got: ", - mask.size(1), - " expected: ", - n_offset_grps * weight_h * weight_w); - 
TORCH_CHECK(input.size(1) % n_offset_grps == 0); - - TORCH_CHECK( - (offset.size(0) == input.size(0)), "invalid batch size of offset"); - TORCH_CHECK( - (offset.size(2) == out_h && offset.size(3) == out_w), - "offset output dims: (", - offset.size(2), - ", ", - offset.size(3), - ") - ", - "computed output dims: (", - out_h, - ", ", - out_w, - ")"); - TORCH_CHECK((mask.size(0) == input.size(0)), "invalid batch size of mask"); - TORCH_CHECK( - (!use_mask || (mask.size(2) == out_h && mask.size(3) == out_w)), - "offset output dims: (", - mask.size(2), - ", ", - mask.size(3), - ") - ", - "computed output dims: (", - out_h, - ", ", - out_w, - ")"); - TORCH_CHECK( - out_h > 0 && out_w > 0, - "Calculated output size too small - out_h: ", - out_h, - " out_w: ", - out_w); - - auto out = at::zeros({batch_sz, out_channels, out_h, out_w}, input.options()); - if (batch_sz == 0) { - return out; - } - - // Separate batches into blocks - out = out.view({batch_sz / n_parallel_imgs, - n_parallel_imgs, - out_channels, - out_h, - out_w}); - input = input.view( - {batch_sz / n_parallel_imgs, n_parallel_imgs, n_in_channels, in_h, in_w}); - - offset = offset.view({batch_sz / n_parallel_imgs, - n_parallel_imgs, - n_offset_grps * 2 * weight_h * weight_w, - out_h, - out_w}); - - if (use_mask) { - mask = mask.view({batch_sz / n_parallel_imgs, - n_parallel_imgs, - n_offset_grps * weight_h * weight_w, - out_h, - out_w}); - } - - at::Tensor out_buf = at::zeros( - {batch_sz / n_parallel_imgs, - out_channels, - n_parallel_imgs * out_h, - out_w}, - out.options()); - - // Separate channels into convolution groups - out_buf = out_buf.view({out_buf.size(0), - n_weight_grps, - out_buf.size(1) / n_weight_grps, - out_buf.size(2), - out_buf.size(3)}); - weight = weight.view({n_weight_grps, - weight.size(0) / n_weight_grps, - weight.size(1), - weight.size(2), - weight.size(3)}); - - // Sample points and perform convolution - auto columns = at::zeros( - {n_in_channels * weight_h * weight_w, n_parallel_imgs * out_h * out_w}, - input.options()); - for (int b = 0; b < batch_sz / n_parallel_imgs; b++) { - deformable_im2col( - input[b], - offset[b], - mask[b], - n_in_channels, - in_h, - in_w, - weight_h, - weight_w, - pad_h, - pad_w, - stride_h, - stride_w, - dil_h, - dil_w, - out_h, - out_w, - n_parallel_imgs, - n_offset_grps, - use_mask, - columns); - - columns = columns.view( - {n_weight_grps, columns.size(0) / n_weight_grps, columns.size(1)}); - for (int g = 0; g < n_weight_grps; g++) { - out_buf[b][g] = out_buf[b][g] - .flatten(1) - .addmm_(weight[g].flatten(1), columns[g]) - .view_as(out_buf[b][g]); - } - columns = - columns.view({columns.size(0) * columns.size(1), columns.size(2)}); - } - - out_buf = out_buf.view({batch_sz / n_parallel_imgs, - out_channels, - n_parallel_imgs, - out_h, - out_w}); - out_buf.transpose_(1, 2); - out.copy_(out_buf); - out = out.view({batch_sz, out_channels, out_h, out_w}); - - return out + bias.view({1, out_channels, 1, 1}); -} - template -static void deformable_col2im_kernel( +void deformable_col2im_kernel( int n, const scalar_t* col, const scalar_t* offset, @@ -533,7 +327,7 @@ static void deformable_col2im_kernel( } } -static void compute_grad_input( +void compute_grad_input( const at::Tensor& columns, const at::Tensor& offset, const at::Tensor& mask, @@ -587,7 +381,7 @@ static void compute_grad_input( } template -static scalar_t get_coordinate_weight( +scalar_t get_coordinate_weight( const scalar_t* im_data, int height, int width, @@ -620,7 +414,7 @@ static scalar_t get_coordinate_weight( } 
template -static void deformable_col2im_coord_kernel( +void deformable_col2im_coord_kernel( int n, const scalar_t* col, const scalar_t* im, @@ -732,7 +526,7 @@ static void deformable_col2im_coord_kernel( } } -static void compute_grad_offset_and_mask( +void compute_grad_offset_and_mask( const at::Tensor& columns, const at::Tensor& input, const at::Tensor& offset, @@ -790,7 +584,7 @@ static void compute_grad_offset_and_mask( })); } -static std::tuple gradient_inputs( +std::tuple gradient_inputs( at::Tensor input, at::Tensor weight, at::Tensor offset, @@ -943,7 +737,7 @@ static std::tuple gradient_inputs( return std::make_tuple(grad_input, grad_offset, grad_mask); } -static at::Tensor backward_gradient_parameters( +at::Tensor backward_gradient_parameters( at::Tensor input, const at::Tensor& weight, at::Tensor offset, @@ -1057,6 +851,216 @@ static at::Tensor backward_gradient_parameters( return grad_weight; } +} // namespace + +at::Tensor DeformConv2d_forward_cpu( + const at::Tensor& input_param, + const at::Tensor& weight_param, + const at::Tensor& offset_param, + const at::Tensor& mask_param, + const at::Tensor& bias_param, + int64_t stride_h, + int64_t stride_w, + int64_t pad_h, + int64_t pad_w, + int64_t dil_h, + int64_t dil_w, + int64_t n_weight_grps, + int64_t n_offset_grps, + bool use_mask) { + at::Tensor input = input_param.contiguous(); + at::Tensor offset = offset_param.contiguous(); + at::Tensor weight = weight_param.contiguous(); + at::Tensor mask = mask_param.contiguous(); + at::Tensor bias = bias_param.contiguous(); + + TORCH_CHECK(input.ndimension() == 4); + TORCH_CHECK(offset.ndimension() == 4); + TORCH_CHECK(!use_mask || mask.ndimension() == 4); + TORCH_CHECK(weight.ndimension() == 4); + TORCH_CHECK(input.device().is_cpu(), "input must be a CPU tensor"); + + int batch_sz = input.size(0); + int n_in_channels = input.size(1); + int in_h = input.size(2); + int in_w = input.size(3); + + int n_parallel_imgs = + get_greatest_divisor_below_bound(batch_sz, kMaxParallelImgs); + + // Unpack shapes and args + int out_channels = weight.size(0); + int weight_h = weight.size(2); + int weight_w = weight.size(3); + + int ker_h = dil_h * (weight_h - 1) + 1; + int ker_w = dil_w * (weight_w - 1) + 1; + int out_h = ((in_h + 2 * pad_h - ker_h) / stride_h) + 1; + int out_w = ((in_w + 2 * pad_w - ker_w) / stride_w) + 1; + + TORCH_CHECK( + weight_h > 0 && weight_w > 0, + "weight_h: ", + weight_h, + " weight_w: ", + weight_w); + TORCH_CHECK( + stride_h > 0 && stride_w > 0, + "stride_h: ", + stride_h, + " stride_w: ", + stride_w); + TORCH_CHECK(pad_h >= 0 && pad_w >= 0, "pad_h: ", pad_h, " pad_w: ", pad_w); + TORCH_CHECK(dil_h > 0 && dil_w > 0, "dil_h: ", dil_h, " dil_w: ", dil_w); + + TORCH_CHECK(weight.size(1) * n_weight_grps == input.size(1)); + TORCH_CHECK(weight.size(0) % n_weight_grps == 0); + TORCH_CHECK( + (offset.size(1) == n_offset_grps * 2 * weight_h * weight_w), + "offset.shape[1] is not valid: got: ", + offset.size(1), + " expected: ", + n_offset_grps * 2 * weight_h * weight_w); + TORCH_CHECK( + (!use_mask || mask.size(1) == n_offset_grps * weight_h * weight_w), + "mask.shape[1] is not valid: got: ", + mask.size(1), + " expected: ", + n_offset_grps * weight_h * weight_w); + TORCH_CHECK(input.size(1) % n_offset_grps == 0); + + TORCH_CHECK( + (offset.size(0) == input.size(0)), "invalid batch size of offset"); + TORCH_CHECK( + (offset.size(2) == out_h && offset.size(3) == out_w), + "offset output dims: (", + offset.size(2), + ", ", + offset.size(3), + ") - ", + "computed output dims: (", + 
out_h, + ", ", + out_w, + ")"); + TORCH_CHECK((mask.size(0) == input.size(0)), "invalid batch size of mask"); + TORCH_CHECK( + (!use_mask || (mask.size(2) == out_h && mask.size(3) == out_w)), + "offset output dims: (", + mask.size(2), + ", ", + mask.size(3), + ") - ", + "computed output dims: (", + out_h, + ", ", + out_w, + ")"); + TORCH_CHECK( + out_h > 0 && out_w > 0, + "Calculated output size too small - out_h: ", + out_h, + " out_w: ", + out_w); + + auto out = at::zeros({batch_sz, out_channels, out_h, out_w}, input.options()); + if (batch_sz == 0) { + return out; + } + + // Separate batches into blocks + out = out.view({batch_sz / n_parallel_imgs, + n_parallel_imgs, + out_channels, + out_h, + out_w}); + input = input.view( + {batch_sz / n_parallel_imgs, n_parallel_imgs, n_in_channels, in_h, in_w}); + + offset = offset.view({batch_sz / n_parallel_imgs, + n_parallel_imgs, + n_offset_grps * 2 * weight_h * weight_w, + out_h, + out_w}); + + if (use_mask) { + mask = mask.view({batch_sz / n_parallel_imgs, + n_parallel_imgs, + n_offset_grps * weight_h * weight_w, + out_h, + out_w}); + } + + at::Tensor out_buf = at::zeros( + {batch_sz / n_parallel_imgs, + out_channels, + n_parallel_imgs * out_h, + out_w}, + out.options()); + + // Separate channels into convolution groups + out_buf = out_buf.view({out_buf.size(0), + n_weight_grps, + out_buf.size(1) / n_weight_grps, + out_buf.size(2), + out_buf.size(3)}); + weight = weight.view({n_weight_grps, + weight.size(0) / n_weight_grps, + weight.size(1), + weight.size(2), + weight.size(3)}); + + // Sample points and perform convolution + auto columns = at::zeros( + {n_in_channels * weight_h * weight_w, n_parallel_imgs * out_h * out_w}, + input.options()); + for (int b = 0; b < batch_sz / n_parallel_imgs; b++) { + deformable_im2col( + input[b], + offset[b], + mask[b], + n_in_channels, + in_h, + in_w, + weight_h, + weight_w, + pad_h, + pad_w, + stride_h, + stride_w, + dil_h, + dil_w, + out_h, + out_w, + n_parallel_imgs, + n_offset_grps, + use_mask, + columns); + + columns = columns.view( + {n_weight_grps, columns.size(0) / n_weight_grps, columns.size(1)}); + for (int g = 0; g < n_weight_grps; g++) { + out_buf[b][g] = out_buf[b][g] + .flatten(1) + .addmm_(weight[g].flatten(1), columns[g]) + .view_as(out_buf[b][g]); + } + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + } + + out_buf = out_buf.view({batch_sz / n_parallel_imgs, + out_channels, + n_parallel_imgs, + out_h, + out_w}); + out_buf.transpose_(1, 2); + out.copy_(out_buf); + out = out.view({batch_sz, out_channels, out_h, out_w}); + + return out + bias.view({1, out_channels, 1, 1}); +} + std::tuple DeformConv2d_backward_cpu( const at::Tensor& grad_out_param, diff --git a/torchvision/csrc/cuda/DeformConv2d_cuda.cu b/torchvision/csrc/cuda/DeformConv2d_cuda.cu index ab4cf64dc43..43bb2292e78 100644 --- a/torchvision/csrc/cuda/DeformConv2d_cuda.cu +++ b/torchvision/csrc/cuda/DeformConv2d_cuda.cu @@ -78,6 +78,8 @@ #include #include +namespace { + const int kMaxParallelImgs = 32; inline unsigned int GET_THREADS() { @@ -205,7 +207,7 @@ __global__ void deformable_im2col_kernel( } } -static void deformable_im2col( +void deformable_im2col( const at::Tensor& input, const at::Tensor& data_offset, const at::Tensor& data_mask, @@ -265,7 +267,7 @@ static void deformable_im2col( } } -static int get_greatest_divisor_below_bound(int n, int bound) { +int get_greatest_divisor_below_bound(int n, int bound) { for (int k = bound; k > 1; --k) { if (n % k == 0) { return k; @@ -274,215 +276,6 @@ 
static int get_greatest_divisor_below_bound(int n, int bound) { return 1; } -at::Tensor DeformConv2d_forward_cuda( - const at::Tensor& input_param, - const at::Tensor& weight_param, - const at::Tensor& offset_param, - const at::Tensor& mask_param, - const at::Tensor& bias_param, - int64_t stride_h, - int64_t stride_w, - int64_t pad_h, - int64_t pad_w, - int64_t dil_h, - int64_t dil_w, - int64_t n_weight_grps, - int64_t n_offset_grps, - bool use_mask) { - at::Tensor input = input_param.contiguous(); - at::Tensor offset = offset_param.contiguous(); - at::Tensor weight = weight_param.contiguous(); - at::Tensor mask = mask_param.contiguous(); - at::Tensor bias = bias_param.contiguous(); - - TORCH_CHECK(input.ndimension() == 4); - TORCH_CHECK(offset.ndimension() == 4); - TORCH_CHECK(!use_mask || mask.ndimension() == 4); - TORCH_CHECK(weight.ndimension() == 4); - TORCH_CHECK(input.is_cuda(), "input must be a CUDA tensor"); - - at::DeviceGuard guard(input.device()); - - int batch_sz = input.size(0); - int in_channels = input.size(1); - int in_h = input.size(2); - int in_w = input.size(3); - - int n_parallel_imgs = - get_greatest_divisor_below_bound(batch_sz, kMaxParallelImgs); - - int out_channels = weight.size(0); - int weight_h = weight.size(2); - int weight_w = weight.size(3); - - int ker_h = dil_h * (weight_h - 1) + 1; - int ker_w = dil_w * (weight_w - 1) + 1; - int out_h = ((in_h + 2 * pad_h - ker_h) / stride_h) + 1; - int out_w = ((in_w + 2 * pad_w - ker_w) / stride_w) + 1; - - TORCH_CHECK( - weight_h > 0 && weight_w > 0, - "weight_h: ", - weight_h, - " weight_w: ", - weight_w); - TORCH_CHECK( - stride_h > 0 && stride_w > 0, - "stride_h: ", - stride_h, - " stride_w: ", - stride_w); - TORCH_CHECK(pad_h >= 0 && pad_w >= 0, "pad_h: ", pad_h, " pad_w: ", pad_w); - TORCH_CHECK(dil_h > 0 && dil_w > 0, "dil_h: ", dil_h, " dil_w: ", dil_w); - - TORCH_CHECK(weight.size(1) * n_weight_grps == input.size(1)); - TORCH_CHECK(weight.size(0) % n_weight_grps == 0); - TORCH_CHECK( - (offset.size(1) == n_offset_grps * 2 * weight_h * weight_w), - "offset.shape[1] is not valid: got: ", - offset.size(1), - " expected: ", - n_offset_grps * 2 * weight_h * weight_w); - TORCH_CHECK( - (!use_mask || mask.size(1) == n_offset_grps * weight_h * weight_w), - "mask.shape[1] is not valid: got: ", - mask.size(1), - " expected: ", - n_offset_grps * weight_h * weight_w); - TORCH_CHECK(input.size(1) % n_offset_grps == 0); - - TORCH_CHECK( - (offset.size(0) == input.size(0)), "invalid batch size of offset"); - TORCH_CHECK( - (offset.size(2) == out_h && offset.size(3) == out_w), - "offset output dims: (", - offset.size(2), - ", ", - offset.size(3), - ") - ", - "computed output dims: (", - out_h, - ", ", - out_w, - ")"); - TORCH_CHECK((mask.size(0) == input.size(0)), "invalid batch size of mask"); - TORCH_CHECK( - (!use_mask || (mask.size(2) == out_h && mask.size(3) == out_w)), - "mask output dims: (", - mask.size(2), - ", ", - mask.size(3), - ") - ", - "computed output dims: (", - out_h, - ", ", - out_w, - ")"); - TORCH_CHECK( - out_h > 0 && out_w > 0, - "Calculated output size too small - out_h: ", - out_h, - " out_w: ", - out_w); - - auto out = at::zeros({batch_sz, out_channels, out_h, out_w}, input.options()); - if (batch_sz == 0) { - return out; - } - - // Separate batches into blocks - out = out.view({batch_sz / n_parallel_imgs, - n_parallel_imgs, - out_channels, - out_h, - out_w}); - input = input.view( - {batch_sz / n_parallel_imgs, n_parallel_imgs, in_channels, in_h, in_w}); - - offset = offset.view({batch_sz / 
n_parallel_imgs, - n_parallel_imgs, - n_offset_grps * 2 * weight_h * weight_w, - out_h, - out_w}); - - if (use_mask) { - mask = mask.view({batch_sz / n_parallel_imgs, - n_parallel_imgs, - n_offset_grps * weight_h * weight_w, - out_h, - out_w}); - } - - at::Tensor out_buf = at::zeros( - {batch_sz / n_parallel_imgs, - out_channels, - n_parallel_imgs * out_h, - out_w}, - out.options()); - - // Separate channels into convolution groups - out_buf = out_buf.view({out_buf.size(0), - n_weight_grps, - out_buf.size(1) / n_weight_grps, - out_buf.size(2), - out_buf.size(3)}); - weight = weight.view({n_weight_grps, - weight.size(0) / n_weight_grps, - weight.size(1), - weight.size(2), - weight.size(3)}); - - // Sample points and perform convolution - auto columns = at::zeros( - {in_channels * weight_h * weight_w, n_parallel_imgs * out_h * out_w}, - input.options()); - for (int b = 0; b < batch_sz / n_parallel_imgs; b++) { - deformable_im2col( - input[b], - offset[b], - mask[b], - in_channels, - in_h, - in_w, - weight_h, - weight_w, - pad_h, - pad_w, - stride_h, - stride_w, - dil_h, - dil_w, - out_h, - out_w, - n_parallel_imgs, - n_offset_grps, - use_mask, - columns); - - columns = columns.view( - {n_weight_grps, columns.size(0) / n_weight_grps, columns.size(1)}); - for (int g = 0; g < n_weight_grps; g++) { - out_buf[b][g] = out_buf[b][g] - .flatten(1) - .addmm_(weight[g].flatten(1), columns[g]) - .view_as(out_buf[b][g]); - } - columns = - columns.view({columns.size(0) * columns.size(1), columns.size(2)}); - } - - out_buf = out_buf.view({batch_sz / n_parallel_imgs, - out_channels, - n_parallel_imgs, - out_h, - out_w}); - out_buf.transpose_(1, 2); - out.copy_(out_buf); - out = out.view({batch_sz, out_channels, out_h, out_w}); - - return out + bias.view({1, out_channels, 1, 1}); -} - template __global__ void deformable_col2im_kernel( int n, @@ -557,7 +350,7 @@ __global__ void deformable_col2im_kernel( } } -static void compute_grad_input( +void compute_grad_input( const at::Tensor& columns, const at::Tensor& offset, const at::Tensor& mask, @@ -763,7 +556,7 @@ __global__ void deformable_col2im_coord_kernel( } } -static void compute_grad_offset_and_mask( +void compute_grad_offset_and_mask( const at::Tensor& columns, const at::Tensor& input, const at::Tensor& offset, @@ -832,7 +625,7 @@ static void compute_grad_offset_and_mask( } } -static std::tuple backward_gradient_inputs( +std::tuple backward_gradient_inputs( at::Tensor input, at::Tensor weight, at::Tensor offset, @@ -986,7 +779,7 @@ static std::tuple backward_gradient_inputs( return std::make_tuple(grad_input, grad_offset, grad_mask); } -static at::Tensor backward_gradient_parameters( +at::Tensor backward_gradient_parameters( at::Tensor input, const at::Tensor& weight, at::Tensor offset, @@ -1102,6 +895,217 @@ static at::Tensor backward_gradient_parameters( return grad_weight; } +} // namespace + +at::Tensor DeformConv2d_forward_cuda( + const at::Tensor& input_param, + const at::Tensor& weight_param, + const at::Tensor& offset_param, + const at::Tensor& mask_param, + const at::Tensor& bias_param, + int64_t stride_h, + int64_t stride_w, + int64_t pad_h, + int64_t pad_w, + int64_t dil_h, + int64_t dil_w, + int64_t n_weight_grps, + int64_t n_offset_grps, + bool use_mask) { + at::Tensor input = input_param.contiguous(); + at::Tensor offset = offset_param.contiguous(); + at::Tensor weight = weight_param.contiguous(); + at::Tensor mask = mask_param.contiguous(); + at::Tensor bias = bias_param.contiguous(); + + TORCH_CHECK(input.ndimension() == 4); + 
TORCH_CHECK(offset.ndimension() == 4); + TORCH_CHECK(!use_mask || mask.ndimension() == 4); + TORCH_CHECK(weight.ndimension() == 4); + TORCH_CHECK(input.is_cuda(), "input must be a CUDA tensor"); + + at::DeviceGuard guard(input.device()); + + int batch_sz = input.size(0); + int in_channels = input.size(1); + int in_h = input.size(2); + int in_w = input.size(3); + + int n_parallel_imgs = + get_greatest_divisor_below_bound(batch_sz, kMaxParallelImgs); + + int out_channels = weight.size(0); + int weight_h = weight.size(2); + int weight_w = weight.size(3); + + int ker_h = dil_h * (weight_h - 1) + 1; + int ker_w = dil_w * (weight_w - 1) + 1; + int out_h = ((in_h + 2 * pad_h - ker_h) / stride_h) + 1; + int out_w = ((in_w + 2 * pad_w - ker_w) / stride_w) + 1; + + TORCH_CHECK( + weight_h > 0 && weight_w > 0, + "weight_h: ", + weight_h, + " weight_w: ", + weight_w); + TORCH_CHECK( + stride_h > 0 && stride_w > 0, + "stride_h: ", + stride_h, + " stride_w: ", + stride_w); + TORCH_CHECK(pad_h >= 0 && pad_w >= 0, "pad_h: ", pad_h, " pad_w: ", pad_w); + TORCH_CHECK(dil_h > 0 && dil_w > 0, "dil_h: ", dil_h, " dil_w: ", dil_w); + + TORCH_CHECK(weight.size(1) * n_weight_grps == input.size(1)); + TORCH_CHECK(weight.size(0) % n_weight_grps == 0); + TORCH_CHECK( + (offset.size(1) == n_offset_grps * 2 * weight_h * weight_w), + "offset.shape[1] is not valid: got: ", + offset.size(1), + " expected: ", + n_offset_grps * 2 * weight_h * weight_w); + TORCH_CHECK( + (!use_mask || mask.size(1) == n_offset_grps * weight_h * weight_w), + "mask.shape[1] is not valid: got: ", + mask.size(1), + " expected: ", + n_offset_grps * weight_h * weight_w); + TORCH_CHECK(input.size(1) % n_offset_grps == 0); + + TORCH_CHECK( + (offset.size(0) == input.size(0)), "invalid batch size of offset"); + TORCH_CHECK( + (offset.size(2) == out_h && offset.size(3) == out_w), + "offset output dims: (", + offset.size(2), + ", ", + offset.size(3), + ") - ", + "computed output dims: (", + out_h, + ", ", + out_w, + ")"); + TORCH_CHECK((mask.size(0) == input.size(0)), "invalid batch size of mask"); + TORCH_CHECK( + (!use_mask || (mask.size(2) == out_h && mask.size(3) == out_w)), + "mask output dims: (", + mask.size(2), + ", ", + mask.size(3), + ") - ", + "computed output dims: (", + out_h, + ", ", + out_w, + ")"); + TORCH_CHECK( + out_h > 0 && out_w > 0, + "Calculated output size too small - out_h: ", + out_h, + " out_w: ", + out_w); + + auto out = at::zeros({batch_sz, out_channels, out_h, out_w}, input.options()); + if (batch_sz == 0) { + return out; + } + + // Separate batches into blocks + out = out.view({batch_sz / n_parallel_imgs, + n_parallel_imgs, + out_channels, + out_h, + out_w}); + input = input.view( + {batch_sz / n_parallel_imgs, n_parallel_imgs, in_channels, in_h, in_w}); + + offset = offset.view({batch_sz / n_parallel_imgs, + n_parallel_imgs, + n_offset_grps * 2 * weight_h * weight_w, + out_h, + out_w}); + + if (use_mask) { + mask = mask.view({batch_sz / n_parallel_imgs, + n_parallel_imgs, + n_offset_grps * weight_h * weight_w, + out_h, + out_w}); + } + + at::Tensor out_buf = at::zeros( + {batch_sz / n_parallel_imgs, + out_channels, + n_parallel_imgs * out_h, + out_w}, + out.options()); + + // Separate channels into convolution groups + out_buf = out_buf.view({out_buf.size(0), + n_weight_grps, + out_buf.size(1) / n_weight_grps, + out_buf.size(2), + out_buf.size(3)}); + weight = weight.view({n_weight_grps, + weight.size(0) / n_weight_grps, + weight.size(1), + weight.size(2), + weight.size(3)}); + + // Sample points and perform convolution 
+ auto columns = at::zeros(
+ {in_channels * weight_h * weight_w, n_parallel_imgs * out_h * out_w},
+ input.options());
+ for (int b = 0; b < batch_sz / n_parallel_imgs; b++) {
+ deformable_im2col(
+ input[b],
+ offset[b],
+ mask[b],
+ in_channels,
+ in_h,
+ in_w,
+ weight_h,
+ weight_w,
+ pad_h,
+ pad_w,
+ stride_h,
+ stride_w,
+ dil_h,
+ dil_w,
+ out_h,
+ out_w,
+ n_parallel_imgs,
+ n_offset_grps,
+ use_mask,
+ columns);
+
+ columns = columns.view(
+ {n_weight_grps, columns.size(0) / n_weight_grps, columns.size(1)});
+ for (int g = 0; g < n_weight_grps; g++) {
+ out_buf[b][g] = out_buf[b][g]
+ .flatten(1)
+ .addmm_(weight[g].flatten(1), columns[g])
+ .view_as(out_buf[b][g]);
+ }
+ columns =
+ columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+ }
+
+ out_buf = out_buf.view({batch_sz / n_parallel_imgs,
+ out_channels,
+ n_parallel_imgs,
+ out_h,
+ out_w});
+ out_buf.transpose_(1, 2);
+ out.copy_(out_buf);
+ out = out.view({batch_sz, out_channels, out_h, out_w});
+
+ return out + bias.view({1, out_channels, 1, 1});
+}
+
 std::tuple
 DeformConv2d_backward_cuda(
 const at::Tensor& grad_out_param,

From 88596ab584a68942e5ddd191f46fd2e4568ba69e Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Thu, 12 Nov 2020 23:05:17 +0000
Subject: [PATCH 04/11] Applying C++ naming rules and aligning variable names across headers and cpp files.

---
 ...rmConv2d_cpu.cpp => deform_conv2d_cpu.cpp} | 4 +-
 torchvision/csrc/cpu/vision_cpu.h | 42 +++++++++----------
 ...rmConv2d_cuda.cu => deform_conv2d_cuda.cu} | 4 +-
 torchvision/csrc/cuda/vision_cuda.h | 42 +++++++++----------
 .../csrc/{DeformConv2d.h => deform_conv2d.h} | 6 +--
 torchvision/csrc/vision.cpp | 16 +++----
 6 files changed, 57 insertions(+), 57 deletions(-)
 rename torchvision/csrc/cpu/{DeformConv2d_cpu.cpp => deform_conv2d_cpu.cpp} (99%)
 rename torchvision/csrc/cuda/{DeformConv2d_cuda.cu => deform_conv2d_cuda.cu} (99%)
 rename torchvision/csrc/{DeformConv2d.h => deform_conv2d.h} (98%)

diff --git a/torchvision/csrc/cpu/DeformConv2d_cpu.cpp b/torchvision/csrc/cpu/deform_conv2d_cpu.cpp
similarity index 99%
rename from torchvision/csrc/cpu/DeformConv2d_cpu.cpp
rename to torchvision/csrc/cpu/deform_conv2d_cpu.cpp
index 197f5176e95..98f6a46facb 100644
--- a/torchvision/csrc/cpu/DeformConv2d_cpu.cpp
+++ b/torchvision/csrc/cpu/deform_conv2d_cpu.cpp
@@ -853,7 +853,7 @@ at::Tensor backward_gradient_parameters(
 } // namespace
-at::Tensor DeformConv2d_forward_cpu(
+at::Tensor deform_conv2d_forward_cpu(
 const at::Tensor& input_param,
 const at::Tensor& weight_param,
 const at::Tensor& offset_param,
@@ -1062,7 +1062,7 @@ at::Tensor DeformConv2d_forward_cpu(
 }
 std::tuple
-DeformConv2d_backward_cpu(
+deform_conv2d_backward_cpu(
 const at::Tensor& grad_out_param,
 const at::Tensor& input_param,
 const at::Tensor& weight_param,

diff --git a/torchvision/csrc/cpu/vision_cpu.h b/torchvision/csrc/cpu/vision_cpu.h
index d5bfcc0de24..26a4221d7c7 100644
--- a/torchvision/csrc/cpu/vision_cpu.h
+++ b/torchvision/csrc/cpu/vision_cpu.h
@@ -2,39 +2,39 @@
 #include
 #include "../macros.h"
-VISION_API at::Tensor DeformConv2d_forward_cpu(
- const at::Tensor& input,
- const at::Tensor& weight,
- const at::Tensor& offset,
- const at::Tensor& mask,
- const at::Tensor& bias,
+VISION_API at::Tensor deform_conv2d_forward_cpu(
+ const at::Tensor& input_param,
+ const at::Tensor& weight_param,
+ const at::Tensor& offset_param,
+ const at::Tensor& mask_param,
+ const at::Tensor& bias_param,
 int64_t stride_h,
 int64_t stride_w,
 int64_t pad_h,
 int64_t pad_w,
- int64_t dilation_h,
- 
int64_t dilation_w, - int64_t groups, - int64_t deformable_groups, + int64_t dil_h, + int64_t dil_w, + int64_t n_weight_grps, + int64_t n_offset_grps, bool use_mask); VISION_API std:: tuple - DeformConv2d_backward_cpu( - const at::Tensor& grad_out, - const at::Tensor& input, - const at::Tensor& weight, - const at::Tensor& offset, - const at::Tensor& mask, - const at::Tensor& bias, + deform_conv2d_backward_cpu( + const at::Tensor& grad_out_param, + const at::Tensor& input_param, + const at::Tensor& weight_param, + const at::Tensor& offset_param, + const at::Tensor& mask_param, + const at::Tensor& bias_param, int64_t stride_h, int64_t stride_w, int64_t pad_h, int64_t pad_w, - int64_t dilation_h, - int64_t dilation_w, - int64_t groups, - int64_t deformable_groups, + int64_t dil_h, + int64_t dil_w, + int64_t n_weight_grps, + int64_t n_offset_grps, bool use_mask); VISION_API at::Tensor nms_cpu( diff --git a/torchvision/csrc/cuda/DeformConv2d_cuda.cu b/torchvision/csrc/cuda/deform_conv2d_cuda.cu similarity index 99% rename from torchvision/csrc/cuda/DeformConv2d_cuda.cu rename to torchvision/csrc/cuda/deform_conv2d_cuda.cu index 43bb2292e78..1b34d687321 100644 --- a/torchvision/csrc/cuda/DeformConv2d_cuda.cu +++ b/torchvision/csrc/cuda/deform_conv2d_cuda.cu @@ -897,7 +897,7 @@ at::Tensor backward_gradient_parameters( } // namespace -at::Tensor DeformConv2d_forward_cuda( +at::Tensor deform_conv2d_forward_cuda( const at::Tensor& input_param, const at::Tensor& weight_param, const at::Tensor& offset_param, @@ -1107,7 +1107,7 @@ at::Tensor DeformConv2d_forward_cuda( } std::tuple -DeformConv2d_backward_cuda( +deform_conv2d_backward_cuda( const at::Tensor& grad_out_param, const at::Tensor& input_param, const at::Tensor& weight_param, diff --git a/torchvision/csrc/cuda/vision_cuda.h b/torchvision/csrc/cuda/vision_cuda.h index bf57f1c7967..1119d331cac 100644 --- a/torchvision/csrc/cuda/vision_cuda.h +++ b/torchvision/csrc/cuda/vision_cuda.h @@ -2,39 +2,39 @@ #include #include "../macros.h" -VISION_API at::Tensor DeformConv2d_forward_cuda( - const at::Tensor& input, - const at::Tensor& weight, - const at::Tensor& offset, - const at::Tensor& mask, - const at::Tensor& bias, +VISION_API at::Tensor deform_conv2d_forward_cuda( + const at::Tensor& input_param, + const at::Tensor& weight_param, + const at::Tensor& offset_param, + const at::Tensor& mask_param, + const at::Tensor& bias_param, int64_t stride_h, int64_t stride_w, int64_t pad_h, int64_t pad_w, - int64_t dilation_h, - int64_t dilation_w, - int64_t groups, - int64_t deformable_groups, + int64_t dil_h, + int64_t dil_w, + int64_t n_weight_grps, + int64_t n_offset_grps, bool use_mask); VISION_API std:: tuple - DeformConv2d_backward_cuda( - const at::Tensor& grad_out, - const at::Tensor& input, - const at::Tensor& weight, - const at::Tensor& offset, - const at::Tensor& mask, - const at::Tensor& bias, + deform_conv2d_backward_cuda( + const at::Tensor& grad_out_param, + const at::Tensor& input_param, + const at::Tensor& weight_param, + const at::Tensor& offset_param, + const at::Tensor& mask_param, + const at::Tensor& bias_param, int64_t stride_h, int64_t stride_w, int64_t pad_h, int64_t pad_w, - int64_t dilation_h, - int64_t dilation_w, - int64_t groups, - int64_t deformable_groups, + int64_t dil_h, + int64_t dil_w, + int64_t n_weight_grps, + int64_t n_offset_grps, bool use_mask); VISION_API at::Tensor nms_cuda( diff --git a/torchvision/csrc/DeformConv2d.h b/torchvision/csrc/deform_conv2d.h similarity index 98% rename from torchvision/csrc/DeformConv2d.h 
rename to torchvision/csrc/deform_conv2d.h index 1fe1b362b1c..e032f65b493 100644 --- a/torchvision/csrc/DeformConv2d.h +++ b/torchvision/csrc/deform_conv2d.h @@ -264,7 +264,7 @@ class DeformConv2dBackwardFunction } // namespace #if defined(WITH_CUDA) || defined(WITH_HIP) -at::Tensor DeformConv2d_autocast( +at::Tensor deform_conv2d_autocast( const at::Tensor& input, const at::Tensor& weight, const at::Tensor& offset, @@ -299,7 +299,7 @@ at::Tensor DeformConv2d_autocast( } #endif -at::Tensor DeformConv2d_autograd( +at::Tensor deform_conv2d_autograd( const at::Tensor& input, const at::Tensor& weight, const at::Tensor& offset, @@ -332,7 +332,7 @@ at::Tensor DeformConv2d_autograd( } std::tuple -DeformConv2d_backward_autograd( +deform_conv2d_backward_autograd( const at::Tensor& grad, const at::Tensor& input, const at::Tensor& weight, diff --git a/torchvision/csrc/vision.cpp b/torchvision/csrc/vision.cpp index f43e18db461..fb89d1f5d3c 100644 --- a/torchvision/csrc/vision.cpp +++ b/torchvision/csrc/vision.cpp @@ -8,11 +8,11 @@ #include #endif -#include "DeformConv2d.h" #include "PSROIAlign.h" #include "PSROIPool.h" #include "ROIAlign.h" #include "ROIPool.h" +#include "deform_conv2d.h" #include "empty_tensor_op.h" #include "nms.h" @@ -71,8 +71,8 @@ TORCH_LIBRARY(torchvision, m) { } TORCH_LIBRARY_IMPL(torchvision, CPU, m) { - m.impl("deform_conv2d", DeformConv2d_forward_cpu); - m.impl("_deform_conv2d_backward", DeformConv2d_backward_cpu); + m.impl("deform_conv2d", deform_conv2d_forward_cpu); + m.impl("_deform_conv2d_backward", deform_conv2d_backward_cpu); m.impl("nms", nms_cpu); m.impl("ps_roi_align", PSROIAlign_forward_cpu); m.impl("_ps_roi_align_backward", PSROIAlign_backward_cpu); @@ -87,8 +87,8 @@ TORCH_LIBRARY_IMPL(torchvision, CPU, m) { // TODO: Place this in a hypothetical separate torchvision_cuda library #if defined(WITH_CUDA) || defined(WITH_HIP) TORCH_LIBRARY_IMPL(torchvision, CUDA, m) { - m.impl("deform_conv2d", DeformConv2d_forward_cuda); - m.impl("_deform_conv2d_backward", DeformConv2d_backward_cuda); + m.impl("deform_conv2d", deform_conv2d_forward_cuda); + m.impl("_deform_conv2d_backward", deform_conv2d_backward_cuda); m.impl("nms", nms_cuda); m.impl("ps_roi_align", PSROIAlign_forward_cuda); m.impl("_ps_roi_align_backward", PSROIAlign_backward_cuda); @@ -104,7 +104,7 @@ TORCH_LIBRARY_IMPL(torchvision, CUDA, m) { // Autocast only needs to wrap forward pass ops. #if defined(WITH_CUDA) || defined(WITH_HIP) TORCH_LIBRARY_IMPL(torchvision, Autocast, m) { - m.impl("deform_conv2d", DeformConv2d_autocast); + m.impl("deform_conv2d", deform_conv2d_autocast); m.impl("nms", nms_autocast); m.impl("ps_roi_align", PSROIAlign_autocast); m.impl("ps_roi_pool", PSROIPool_autocast); @@ -114,8 +114,8 @@ TORCH_LIBRARY_IMPL(torchvision, Autocast, m) { #endif TORCH_LIBRARY_IMPL(torchvision, Autograd, m) { - m.impl("deform_conv2d", DeformConv2d_autograd); - m.impl("_deform_conv2d_backward", DeformConv2d_backward_autograd); + m.impl("deform_conv2d", deform_conv2d_autograd); + m.impl("_deform_conv2d_backward", deform_conv2d_backward_autograd); m.impl("ps_roi_align", PSROIAlign_autograd); m.impl("_ps_roi_align_backward", PSROIAlign_backward_autograd); m.impl("ps_roi_pool", PSROIPool_autograd); From aefe9341cd58d707195cb0317119c5005c244fd7 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 12 Nov 2020 23:09:54 +0000 Subject: [PATCH 05/11] Syncing names across implementations. 
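The CPU helper gradient_inputs is renamed to backward_gradient_inputs so the
CPU and CUDA files now use identical helper names. Reusing one name in two
translation units is safe here because the helpers were wrapped in anonymous
namespaces two commits ago, which gives them internal linkage. A minimal
standalone sketch of why this links cleanly (hypothetical files, not
torchvision code):

    // cpu_impl.cpp
    namespace {
    int backward_gradient_inputs() { return 1; }
    } // namespace
    int cpu_entry() { return backward_gradient_inputs(); }

    // cuda_impl.cpp
    namespace {
    int backward_gradient_inputs() { return 2; }
    } // namespace
    int cuda_entry() { return backward_gradient_inputs(); }

    // main.cpp -- build with: g++ cpu_impl.cpp cuda_impl.cpp main.cpp
    int cpu_entry();
    int cuda_entry();
    int main() { return cpu_entry() + cuda_entry() == 3 ? 0 : 1; }

Each definition stays private to its own object file, so the duplicated name
does not violate the one-definition rule at link time.
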
--- torchvision/csrc/cpu/deform_conv2d_cpu.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchvision/csrc/cpu/deform_conv2d_cpu.cpp b/torchvision/csrc/cpu/deform_conv2d_cpu.cpp index 98f6a46facb..10a161303f6 100644 --- a/torchvision/csrc/cpu/deform_conv2d_cpu.cpp +++ b/torchvision/csrc/cpu/deform_conv2d_cpu.cpp @@ -584,7 +584,7 @@ void compute_grad_offset_and_mask( })); } -std::tuple gradient_inputs( +std::tuple backward_gradient_inputs( at::Tensor input, at::Tensor weight, at::Tensor offset, @@ -1089,7 +1089,7 @@ deform_conv2d_backward_cpu( const int n_parallel_imgs = get_greatest_divisor_below_bound(batch_sz, kMaxParallelImgs); - auto grad_input_and_offset_and_mask = gradient_inputs( + auto grad_input_and_offset_and_mask = backward_gradient_inputs( input, weight, offset, From 602acb2e535a9935f55c0d3a10de8f9015e362f0 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 26 Nov 2020 15:17:41 +0000 Subject: [PATCH 06/11] Rename deform_conv2d.h to deform_conv2d.cpp --- torchvision/csrc/{deform_conv2d.h => deform_conv2d.cpp} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename torchvision/csrc/{deform_conv2d.h => deform_conv2d.cpp} (100%) diff --git a/torchvision/csrc/deform_conv2d.h b/torchvision/csrc/deform_conv2d.cpp similarity index 100% rename from torchvision/csrc/deform_conv2d.h rename to torchvision/csrc/deform_conv2d.cpp From 9026df05da88f5d6084c4451ee798aa703f8a402 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 26 Nov 2020 19:19:48 +0000 Subject: [PATCH 07/11] Use header files: - Create header files for kernel implementation and remove definitions from vision_*.h files. - Eliminate unnecessary headers and ensure all cpp include their headers. --- torchvision/csrc/autocast.h | 2 + torchvision/csrc/cpu/deform_conv2d_cpu.cpp | 8 +-- torchvision/csrc/cpu/deform_conv2d_cpu.h | 39 +++++++++++++ torchvision/csrc/cpu/vision_cpu.h | 35 +---------- torchvision/csrc/cuda/deform_conv2d_cuda.cu | 6 +- torchvision/csrc/cuda/deform_conv2d_cuda.h | 39 +++++++++++++ torchvision/csrc/cuda/vision_cuda.h | 35 +---------- torchvision/csrc/deform_conv2d.cpp | 14 ++--- torchvision/csrc/deform_conv2d.h | 64 +++++++++++++++++++++ 9 files changed, 152 insertions(+), 90 deletions(-) create mode 100644 torchvision/csrc/cpu/deform_conv2d_cpu.h create mode 100644 torchvision/csrc/cuda/deform_conv2d_cuda.h create mode 100644 torchvision/csrc/deform_conv2d.h diff --git a/torchvision/csrc/autocast.h b/torchvision/csrc/autocast.h index 1f954464b72..584ef13f389 100644 --- a/torchvision/csrc/autocast.h +++ b/torchvision/csrc/autocast.h @@ -1,5 +1,7 @@ #pragma once +// TODO: Delete this file once none of the methods use it + #if defined(WITH_CUDA) || defined(WITH_HIP) #include #endif diff --git a/torchvision/csrc/cpu/deform_conv2d_cpu.cpp b/torchvision/csrc/cpu/deform_conv2d_cpu.cpp index 10a161303f6..ed18c6a8be5 100644 --- a/torchvision/csrc/cpu/deform_conv2d_cpu.cpp +++ b/torchvision/csrc/cpu/deform_conv2d_cpu.cpp @@ -66,13 +66,7 @@ // modified from // https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/dcn/src/deform_conv_cuda.cpp -#include -#include -#include - -#include -#include -#include +#include "deform_conv2d_cpu.h" namespace { diff --git a/torchvision/csrc/cpu/deform_conv2d_cpu.h b/torchvision/csrc/cpu/deform_conv2d_cpu.h new file mode 100644 index 00000000000..2eb9375ca1a --- /dev/null +++ b/torchvision/csrc/cpu/deform_conv2d_cpu.h @@ -0,0 +1,39 @@ +#pragma once + +#include +#include "../macros.h" + +VISION_API at::Tensor 
deform_conv2d_forward_cpu( + const at::Tensor& input_param, + const at::Tensor& weight_param, + const at::Tensor& offset_param, + const at::Tensor& mask_param, + const at::Tensor& bias_param, + int64_t stride_h, + int64_t stride_w, + int64_t pad_h, + int64_t pad_w, + int64_t dil_h, + int64_t dil_w, + int64_t n_weight_grps, + int64_t n_offset_grps, + bool use_mask); + +VISION_API std:: + tuple + deform_conv2d_backward_cpu( + const at::Tensor& grad_out_param, + const at::Tensor& input_param, + const at::Tensor& weight_param, + const at::Tensor& offset_param, + const at::Tensor& mask_param, + const at::Tensor& bias_param, + int64_t stride_h, + int64_t stride_w, + int64_t pad_h, + int64_t pad_w, + int64_t dil_h, + int64_t dil_w, + int64_t n_weight_grps, + int64_t n_offset_grps, + bool use_mask); diff --git a/torchvision/csrc/cpu/vision_cpu.h b/torchvision/csrc/cpu/vision_cpu.h index 26a4221d7c7..6f85d9c0256 100644 --- a/torchvision/csrc/cpu/vision_cpu.h +++ b/torchvision/csrc/cpu/vision_cpu.h @@ -2,40 +2,7 @@ #include #include "../macros.h" -VISION_API at::Tensor deform_conv2d_forward_cpu( - const at::Tensor& input_param, - const at::Tensor& weight_param, - const at::Tensor& offset_param, - const at::Tensor& mask_param, - const at::Tensor& bias_param, - int64_t stride_h, - int64_t stride_w, - int64_t pad_h, - int64_t pad_w, - int64_t dil_h, - int64_t dil_w, - int64_t n_weight_grps, - int64_t n_offset_grps, - bool use_mask); - -VISION_API std:: - tuple - deform_conv2d_backward_cpu( - const at::Tensor& grad_out_param, - const at::Tensor& input_param, - const at::Tensor& weight_param, - const at::Tensor& offset_param, - const at::Tensor& mask_param, - const at::Tensor& bias_param, - int64_t stride_h, - int64_t stride_w, - int64_t pad_h, - int64_t pad_w, - int64_t dil_h, - int64_t dil_w, - int64_t n_weight_grps, - int64_t n_offset_grps, - bool use_mask); +// TODO: Delete this file once all the methods are gone VISION_API at::Tensor nms_cpu( const at::Tensor& dets, diff --git a/torchvision/csrc/cuda/deform_conv2d_cuda.cu b/torchvision/csrc/cuda/deform_conv2d_cuda.cu index 03596eb97f7..eb1d8309d0f 100644 --- a/torchvision/csrc/cuda/deform_conv2d_cuda.cu +++ b/torchvision/csrc/cuda/deform_conv2d_cuda.cu @@ -67,17 +67,13 @@ // https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/dcn/src/deform_conv_cuda.cpp #include -#include #include #include #include +#include "deform_conv2d_cuda.h" #include "cuda_helpers.h" -#include -#include -#include - namespace { const int kMaxParallelImgs = 32; diff --git a/torchvision/csrc/cuda/deform_conv2d_cuda.h b/torchvision/csrc/cuda/deform_conv2d_cuda.h new file mode 100644 index 00000000000..c9afe68849d --- /dev/null +++ b/torchvision/csrc/cuda/deform_conv2d_cuda.h @@ -0,0 +1,39 @@ +#pragma once + +#include +#include "../macros.h" + +VISION_API at::Tensor deform_conv2d_forward_cuda( + const at::Tensor& input_param, + const at::Tensor& weight_param, + const at::Tensor& offset_param, + const at::Tensor& mask_param, + const at::Tensor& bias_param, + int64_t stride_h, + int64_t stride_w, + int64_t pad_h, + int64_t pad_w, + int64_t dil_h, + int64_t dil_w, + int64_t n_weight_grps, + int64_t n_offset_grps, + bool use_mask); + +VISION_API std:: + tuple + deform_conv2d_backward_cuda( + const at::Tensor& grad_out_param, + const at::Tensor& input_param, + const at::Tensor& weight_param, + const at::Tensor& offset_param, + const at::Tensor& mask_param, + const at::Tensor& bias_param, + int64_t stride_h, + int64_t stride_w, + int64_t pad_h, + int64_t pad_w, + int64_t 
dil_h, + int64_t dil_w, + int64_t n_weight_grps, + int64_t n_offset_grps, + bool use_mask); diff --git a/torchvision/csrc/cuda/vision_cuda.h b/torchvision/csrc/cuda/vision_cuda.h index 1119d331cac..834973c5327 100644 --- a/torchvision/csrc/cuda/vision_cuda.h +++ b/torchvision/csrc/cuda/vision_cuda.h @@ -2,40 +2,7 @@ #include #include "../macros.h" -VISION_API at::Tensor deform_conv2d_forward_cuda( - const at::Tensor& input_param, - const at::Tensor& weight_param, - const at::Tensor& offset_param, - const at::Tensor& mask_param, - const at::Tensor& bias_param, - int64_t stride_h, - int64_t stride_w, - int64_t pad_h, - int64_t pad_w, - int64_t dil_h, - int64_t dil_w, - int64_t n_weight_grps, - int64_t n_offset_grps, - bool use_mask); - -VISION_API std:: - tuple - deform_conv2d_backward_cuda( - const at::Tensor& grad_out_param, - const at::Tensor& input_param, - const at::Tensor& weight_param, - const at::Tensor& offset_param, - const at::Tensor& mask_param, - const at::Tensor& bias_param, - int64_t stride_h, - int64_t stride_w, - int64_t pad_h, - int64_t pad_w, - int64_t dil_h, - int64_t dil_w, - int64_t n_weight_grps, - int64_t n_offset_grps, - bool use_mask); +// TODO: Delete this file once all the methods are gone VISION_API at::Tensor nms_cuda( const at::Tensor& dets, diff --git a/torchvision/csrc/deform_conv2d.cpp b/torchvision/csrc/deform_conv2d.cpp index e032f65b493..062a6a3e9d9 100644 --- a/torchvision/csrc/deform_conv2d.cpp +++ b/torchvision/csrc/deform_conv2d.cpp @@ -1,14 +1,8 @@ -#pragma once +#include "deform_conv2d.h" +#include -#include "cpu/vision_cpu.h" - -#ifdef WITH_CUDA -#include "autocast.h" -#include "cuda/vision_cuda.h" -#endif -#ifdef WITH_HIP -#include "autocast.h" -#include "hip/vision_cuda.h" +#if defined(WITH_CUDA) || defined(WITH_HIP) +#include #endif namespace { diff --git a/torchvision/csrc/deform_conv2d.h b/torchvision/csrc/deform_conv2d.h new file mode 100644 index 00000000000..e6250a1847b --- /dev/null +++ b/torchvision/csrc/deform_conv2d.h @@ -0,0 +1,64 @@ +#pragma once + +#include "cpu/deform_conv2d_cpu.h" + +#ifdef WITH_CUDA +#include "cuda/deform_conv2d_cuda.h" +#endif +#ifdef WITH_HIP +#include "hip/deform_conv2d_cuda.h" +#endif + +// Autocast Registration +#if defined(WITH_CUDA) || defined(WITH_HIP) +at::Tensor deform_conv2d_autocast( + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& offset, + const at::Tensor& mask, + const at::Tensor& bias, + int64_t stride_h, + int64_t stride_w, + int64_t pad_h, + int64_t pad_w, + int64_t dilation_h, + int64_t dilation_w, + int64_t groups, + int64_t offset_groups, + bool use_mask); +#endif + +// Autograd Registration +at::Tensor deform_conv2d_autograd( + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& offset, + const at::Tensor& mask, + const at::Tensor& bias, + int64_t stride_h, + int64_t stride_w, + int64_t pad_h, + int64_t pad_w, + int64_t dilation_h, + int64_t dilation_w, + int64_t groups, + int64_t offset_groups, + bool use_mask); + +std::tuple +deform_conv2d_backward_autograd( + const at::Tensor& grad, + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& offset, + const at::Tensor& mask, + const at::Tensor& bias, + int64_t stride_h, + int64_t stride_w, + int64_t pad_h, + int64_t pad_w, + int64_t dilation_h, + int64_t dilation_w, + int64_t groups, + int64_t offset_groups, + bool use_mask); From ec11906f6b2a74ca37686e15a763207fda14433f Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 26 Nov 2020 20:45:07 +0000 Subject: [PATCH 
08/11] Change the naming convention for kernel implementations. --- .../cpu/{deform_conv2d_cpu.cpp => deform_conv2d_kernel.cpp} | 2 +- .../cpu/{deform_conv2d_cpu.h => deform_conv2d_kernel.h} | 0 .../cuda/{deform_conv2d_cuda.cu => deform_conv2d_kernel.cu} | 2 +- .../cuda/{deform_conv2d_cuda.h => deform_conv2d_kernel.h} | 0 torchvision/csrc/deform_conv2d.h | 6 +++--- 5 files changed, 5 insertions(+), 5 deletions(-) rename torchvision/csrc/cpu/{deform_conv2d_cpu.cpp => deform_conv2d_kernel.cpp} (99%) rename torchvision/csrc/cpu/{deform_conv2d_cpu.h => deform_conv2d_kernel.h} (100%) rename torchvision/csrc/cuda/{deform_conv2d_cuda.cu => deform_conv2d_kernel.cu} (99%) rename torchvision/csrc/cuda/{deform_conv2d_cuda.h => deform_conv2d_kernel.h} (100%) diff --git a/torchvision/csrc/cpu/deform_conv2d_cpu.cpp b/torchvision/csrc/cpu/deform_conv2d_kernel.cpp similarity index 99% rename from torchvision/csrc/cpu/deform_conv2d_cpu.cpp rename to torchvision/csrc/cpu/deform_conv2d_kernel.cpp index ed18c6a8be5..b3ca51eaa10 100644 --- a/torchvision/csrc/cpu/deform_conv2d_cpu.cpp +++ b/torchvision/csrc/cpu/deform_conv2d_kernel.cpp @@ -66,7 +66,7 @@ // modified from // https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/dcn/src/deform_conv_cuda.cpp -#include "deform_conv2d_cpu.h" +#include "deform_conv2d_kernel.h" namespace { diff --git a/torchvision/csrc/cpu/deform_conv2d_cpu.h b/torchvision/csrc/cpu/deform_conv2d_kernel.h similarity index 100% rename from torchvision/csrc/cpu/deform_conv2d_cpu.h rename to torchvision/csrc/cpu/deform_conv2d_kernel.h diff --git a/torchvision/csrc/cuda/deform_conv2d_cuda.cu b/torchvision/csrc/cuda/deform_conv2d_kernel.cu similarity index 99% rename from torchvision/csrc/cuda/deform_conv2d_cuda.cu rename to torchvision/csrc/cuda/deform_conv2d_kernel.cu index eb1d8309d0f..76a83bc82c5 100644 --- a/torchvision/csrc/cuda/deform_conv2d_cuda.cu +++ b/torchvision/csrc/cuda/deform_conv2d_kernel.cu @@ -71,8 +71,8 @@ #include #include -#include "deform_conv2d_cuda.h" #include "cuda_helpers.h" +#include "deform_conv2d_kernel.h" namespace { diff --git a/torchvision/csrc/cuda/deform_conv2d_cuda.h b/torchvision/csrc/cuda/deform_conv2d_kernel.h similarity index 100% rename from torchvision/csrc/cuda/deform_conv2d_cuda.h rename to torchvision/csrc/cuda/deform_conv2d_kernel.h diff --git a/torchvision/csrc/deform_conv2d.h b/torchvision/csrc/deform_conv2d.h index e6250a1847b..ceebae52d68 100644 --- a/torchvision/csrc/deform_conv2d.h +++ b/torchvision/csrc/deform_conv2d.h @@ -1,12 +1,12 @@ #pragma once -#include "cpu/deform_conv2d_cpu.h" +#include "cpu/deform_conv2d_kernel.h" #ifdef WITH_CUDA -#include "cuda/deform_conv2d_cuda.h" +#include "cuda/deform_conv2d_kernel.h" #endif #ifdef WITH_HIP -#include "hip/deform_conv2d_cuda.h" +#include "hip/deform_conv2d_kernel.h" #endif // Autocast Registration From f3f84694c9a26e2a914866babf456190ff0d5a00 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Thu, 26 Nov 2020 22:02:56 +0000 Subject: [PATCH 09/11] Remove the _param postfix from the variables and standardizing names. 
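The public signatures return to the plain argument names (input, weight,
offset, mask, bias), and the contiguous working copies created inside the
kernels take a "_c" suffix instead; dil_h/dil_w likewise become
dilation_h/dilation_w to match the declarations in deform_conv2d.h. A minimal
sketch of the pattern on a hypothetical operator (assumes only ATen; not an
operator from this diff):

    // sketch.cpp -- link against libtorch
    #include <ATen/ATen.h>

    at::Tensor double_op(const at::Tensor& input) {
      // The argument keeps its public name; the contiguous copy gets the
      // "_c" suffix rather than renaming the argument to input_param.
      at::Tensor input_c = input.contiguous();
      return input_c.mul(2);
    }

    int main() {
      // A 2x2 tensor of ones, doubled, sums to 8.
      return double_op(at::ones({2, 2})).sum().item<float>() == 8.0f ? 0 : 1;
    }
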
--- torchvision/csrc/cpu/deform_conv2d_kernel.cpp | 253 ++++++++--------- torchvision/csrc/cpu/deform_conv2d_kernel.h | 30 +-- torchvision/csrc/cuda/deform_conv2d_kernel.cu | 255 +++++++++--------- torchvision/csrc/cuda/deform_conv2d_kernel.h | 30 +-- 4 files changed, 295 insertions(+), 273 deletions(-) diff --git a/torchvision/csrc/cpu/deform_conv2d_kernel.cpp b/torchvision/csrc/cpu/deform_conv2d_kernel.cpp index b3ca51eaa10..f593e880b3b 100644 --- a/torchvision/csrc/cpu/deform_conv2d_kernel.cpp +++ b/torchvision/csrc/cpu/deform_conv2d_kernel.cpp @@ -125,8 +125,8 @@ void deformable_im2col_kernel( int pad_w, int stride_h, int stride_w, - int dil_h, - int dil_w, + int dilation_h, + int dilation_w, int batch_sz, int n_in_channels, int n_offset_grps, @@ -176,8 +176,10 @@ void deformable_im2col_kernel( offset_ptr[offset_idx * (out_h * out_w) + out_y * out_w + out_x]; const scalar_t offset_w = offset_ptr [(offset_idx + 1) * (out_h * out_w) + out_y * out_w + out_x]; - const scalar_t y = (out_y * stride_h - pad_h) + i * dil_h + offset_h; - const scalar_t x = (out_x * stride_w - pad_w) + j * dil_w + offset_w; + const scalar_t y = + (out_y * stride_h - pad_h) + i * dilation_h + offset_h; + const scalar_t x = + (out_x * stride_w - pad_w) + j * dilation_w + offset_w; *columns_ptr = mask_value * bilinear_interpolate(input_ptr, height, width, y, x); columns_ptr += batch_sz * out_h * out_w; @@ -199,8 +201,8 @@ void deformable_im2col( int pad_w, int stride_h, int stride_w, - int dil_h, - int dil_w, + int dilation_h, + int dilation_w, int out_h, int out_w, int parallel_imgs, @@ -224,8 +226,8 @@ void deformable_im2col( pad_w, stride_h, stride_w, - dil_h, - dil_w, + dilation_h, + dilation_w, parallel_imgs, n_in_channels, deformable_group, @@ -588,8 +590,8 @@ std::tuple backward_gradient_inputs( int stride_w, int pad_h, int pad_w, - int dil_h, - int dil_w, + int dilation_h, + int dilation_w, int n_weight_grps, int n_offset_grps, int n_parallel_imgs, @@ -605,8 +607,10 @@ std::tuple backward_gradient_inputs( int weight_h = weight.size(2); int weight_w = weight.size(3); - long out_h = (in_h + 2 * pad_h - (dil_h * (weight_h - 1) + 1)) / stride_h + 1; - long out_w = (in_w + 2 * pad_w - (dil_w * (weight_w - 1) + 1)) / stride_w + 1; + long out_h = + (in_h + 2 * pad_h - (dilation_h * (weight_h - 1) + 1)) / stride_h + 1; + long out_w = + (in_w + 2 * pad_w - (dilation_w * (weight_w - 1) + 1)) / stride_w + 1; auto grad_input = at::zeros_like(input); auto grad_offset = at::zeros_like(offset); @@ -690,8 +694,8 @@ std::tuple backward_gradient_inputs( pad_w, stride_h, stride_w, - dil_h, - dil_w, + dilation_h, + dilation_w, n_parallel_imgs, n_offset_grps, use_mask, @@ -711,8 +715,8 @@ std::tuple backward_gradient_inputs( pad_w, stride_h, stride_w, - dil_h, - dil_w, + dilation_h, + dilation_w, n_parallel_imgs, n_offset_grps, use_mask, @@ -741,8 +745,8 @@ at::Tensor backward_gradient_parameters( int stride_w, int pad_h, int pad_w, - int dil_h, - int dil_w, + int dilation_h, + int dilation_w, int n_weight_grps, int n_offset_grps, int n_parallel_imgs, @@ -819,8 +823,8 @@ at::Tensor backward_gradient_parameters( pad_w, stride_h, stride_w, - dil_h, - dil_w, + dilation_h, + dilation_w, out_h, out_w, n_parallel_imgs, @@ -848,47 +852,47 @@ at::Tensor backward_gradient_parameters( } // namespace at::Tensor deform_conv2d_forward_cpu( - const at::Tensor& input_param, - const at::Tensor& weight_param, - const at::Tensor& offset_param, - const at::Tensor& mask_param, - const at::Tensor& bias_param, + const at::Tensor& input, + const 
at::Tensor& weight, + const at::Tensor& offset, + const at::Tensor& mask, + const at::Tensor& bias, int64_t stride_h, int64_t stride_w, int64_t pad_h, int64_t pad_w, - int64_t dil_h, - int64_t dil_w, + int64_t dilation_h, + int64_t dilation_w, int64_t n_weight_grps, int64_t n_offset_grps, bool use_mask) { - at::Tensor input = input_param.contiguous(); - at::Tensor offset = offset_param.contiguous(); - at::Tensor weight = weight_param.contiguous(); - at::Tensor mask = mask_param.contiguous(); - at::Tensor bias = bias_param.contiguous(); - - TORCH_CHECK(input.ndimension() == 4); - TORCH_CHECK(offset.ndimension() == 4); - TORCH_CHECK(!use_mask || mask.ndimension() == 4); - TORCH_CHECK(weight.ndimension() == 4); - TORCH_CHECK(input.device().is_cpu(), "input must be a CPU tensor"); - - int batch_sz = input.size(0); - int n_in_channels = input.size(1); - int in_h = input.size(2); - int in_w = input.size(3); + at::Tensor input_c = input.contiguous(); + at::Tensor offset_c = offset.contiguous(); + at::Tensor weight_c = weight.contiguous(); + at::Tensor mask_c = mask.contiguous(); + at::Tensor bias_c = bias.contiguous(); + + TORCH_CHECK(input_c.ndimension() == 4); + TORCH_CHECK(offset_c.ndimension() == 4); + TORCH_CHECK(!use_mask || mask_c.ndimension() == 4); + TORCH_CHECK(weight_c.ndimension() == 4); + TORCH_CHECK(input_c.device().is_cpu(), "input must be a CPU tensor"); + + int batch_sz = input_c.size(0); + int n_in_channels = input_c.size(1); + int in_h = input_c.size(2); + int in_w = input_c.size(3); int n_parallel_imgs = get_greatest_divisor_below_bound(batch_sz, kMaxParallelImgs); // Unpack shapes and args - int out_channels = weight.size(0); - int weight_h = weight.size(2); - int weight_w = weight.size(3); + int out_channels = weight_c.size(0); + int weight_h = weight_c.size(2); + int weight_w = weight_c.size(3); - int ker_h = dil_h * (weight_h - 1) + 1; - int ker_w = dil_w * (weight_w - 1) + 1; + int ker_h = dilation_h * (weight_h - 1) + 1; + int ker_w = dilation_w * (weight_w - 1) + 1; int out_h = ((in_h + 2 * pad_h - ker_h) / stride_h) + 1; int out_w = ((in_w + 2 * pad_w - ker_w) / stride_w) + 1; @@ -905,45 +909,51 @@ at::Tensor deform_conv2d_forward_cpu( " stride_w: ", stride_w); TORCH_CHECK(pad_h >= 0 && pad_w >= 0, "pad_h: ", pad_h, " pad_w: ", pad_w); - TORCH_CHECK(dil_h > 0 && dil_w > 0, "dil_h: ", dil_h, " dil_w: ", dil_w); - - TORCH_CHECK(weight.size(1) * n_weight_grps == input.size(1)); - TORCH_CHECK(weight.size(0) % n_weight_grps == 0); TORCH_CHECK( - (offset.size(1) == n_offset_grps * 2 * weight_h * weight_w), + dilation_h > 0 && dilation_w > 0, + "dilation_h: ", + dilation_h, + " dilation_w: ", + dilation_w); + + TORCH_CHECK(weight_c.size(1) * n_weight_grps == input_c.size(1)); + TORCH_CHECK(weight_c.size(0) % n_weight_grps == 0); + TORCH_CHECK( + (offset_c.size(1) == n_offset_grps * 2 * weight_h * weight_w), "offset.shape[1] is not valid: got: ", - offset.size(1), + offset_c.size(1), " expected: ", n_offset_grps * 2 * weight_h * weight_w); TORCH_CHECK( - (!use_mask || mask.size(1) == n_offset_grps * weight_h * weight_w), + (!use_mask || mask_c.size(1) == n_offset_grps * weight_h * weight_w), "mask.shape[1] is not valid: got: ", - mask.size(1), + mask_c.size(1), " expected: ", n_offset_grps * weight_h * weight_w); - TORCH_CHECK(input.size(1) % n_offset_grps == 0); + TORCH_CHECK(input_c.size(1) % n_offset_grps == 0); TORCH_CHECK( - (offset.size(0) == input.size(0)), "invalid batch size of offset"); + (offset_c.size(0) == input_c.size(0)), "invalid batch size of offset"); 
TORCH_CHECK( - (offset.size(2) == out_h && offset.size(3) == out_w), + (offset_c.size(2) == out_h && offset_c.size(3) == out_w), "offset output dims: (", - offset.size(2), + offset_c.size(2), ", ", - offset.size(3), + offset_c.size(3), ") - ", "computed output dims: (", out_h, ", ", out_w, ")"); - TORCH_CHECK((mask.size(0) == input.size(0)), "invalid batch size of mask"); TORCH_CHECK( - (!use_mask || (mask.size(2) == out_h && mask.size(3) == out_w)), + (mask_c.size(0) == input_c.size(0)), "invalid batch size of mask"); + TORCH_CHECK( + (!use_mask || (mask_c.size(2) == out_h && mask_c.size(3) == out_w)), "offset output dims: (", - mask.size(2), + mask_c.size(2), ", ", - mask.size(3), + mask_c.size(3), ") - ", "computed output dims: (", out_h, @@ -957,7 +967,8 @@ at::Tensor deform_conv2d_forward_cpu( " out_w: ", out_w); - auto out = at::zeros({batch_sz, out_channels, out_h, out_w}, input.options()); + auto out = + at::zeros({batch_sz, out_channels, out_h, out_w}, input_c.options()); if (batch_sz == 0) { return out; } @@ -968,21 +979,21 @@ at::Tensor deform_conv2d_forward_cpu( out_channels, out_h, out_w}); - input = input.view( + input_c = input_c.view( {batch_sz / n_parallel_imgs, n_parallel_imgs, n_in_channels, in_h, in_w}); - offset = offset.view({batch_sz / n_parallel_imgs, - n_parallel_imgs, - n_offset_grps * 2 * weight_h * weight_w, - out_h, - out_w}); + offset_c = offset_c.view({batch_sz / n_parallel_imgs, + n_parallel_imgs, + n_offset_grps * 2 * weight_h * weight_w, + out_h, + out_w}); if (use_mask) { - mask = mask.view({batch_sz / n_parallel_imgs, - n_parallel_imgs, - n_offset_grps * weight_h * weight_w, - out_h, - out_w}); + mask_c = mask_c.view({batch_sz / n_parallel_imgs, + n_parallel_imgs, + n_offset_grps * weight_h * weight_w, + out_h, + out_w}); } at::Tensor out_buf = at::zeros( @@ -998,21 +1009,21 @@ at::Tensor deform_conv2d_forward_cpu( out_buf.size(1) / n_weight_grps, out_buf.size(2), out_buf.size(3)}); - weight = weight.view({n_weight_grps, - weight.size(0) / n_weight_grps, - weight.size(1), - weight.size(2), - weight.size(3)}); + weight_c = weight_c.view({n_weight_grps, + weight_c.size(0) / n_weight_grps, + weight_c.size(1), + weight_c.size(2), + weight_c.size(3)}); // Sample points and perform convolution auto columns = at::zeros( {n_in_channels * weight_h * weight_w, n_parallel_imgs * out_h * out_w}, - input.options()); + input_c.options()); for (int b = 0; b < batch_sz / n_parallel_imgs; b++) { deformable_im2col( - input[b], - offset[b], - mask[b], + input_c[b], + offset_c[b], + mask_c[b], n_in_channels, in_h, in_w, @@ -1022,8 +1033,8 @@ at::Tensor deform_conv2d_forward_cpu( pad_w, stride_h, stride_w, - dil_h, - dil_w, + dilation_h, + dilation_w, out_h, out_w, n_parallel_imgs, @@ -1036,7 +1047,7 @@ at::Tensor deform_conv2d_forward_cpu( for (int g = 0; g < n_weight_grps; g++) { out_buf[b][g] = out_buf[b][g] .flatten(1) - .addmm_(weight[g].flatten(1), columns[g]) + .addmm_(weight_c[g].flatten(1), columns[g]) .view_as(out_buf[b][g]); } columns = @@ -1052,49 +1063,49 @@ at::Tensor deform_conv2d_forward_cpu( out.copy_(out_buf); out = out.view({batch_sz, out_channels, out_h, out_w}); - return out + bias.view({1, out_channels, 1, 1}); + return out + bias_c.view({1, out_channels, 1, 1}); } std::tuple deform_conv2d_backward_cpu( - const at::Tensor& grad_out_param, - const at::Tensor& input_param, - const at::Tensor& weight_param, - const at::Tensor& offset_param, - const at::Tensor& mask_param, - const at::Tensor& bias_param, + const at::Tensor& grad_out, + const at::Tensor& 
input, + const at::Tensor& weight, + const at::Tensor& offset, + const at::Tensor& mask, + const at::Tensor& bias, int64_t stride_h, int64_t stride_w, int64_t pad_h, int64_t pad_w, - int64_t dil_h, - int64_t dil_w, + int64_t dilation_h, + int64_t dilation_w, int64_t n_weight_grps, int64_t n_offset_grps, bool use_mask) { - at::Tensor grad_out = grad_out_param.contiguous(); - at::Tensor input = input_param.contiguous(); - at::Tensor weight = weight_param.contiguous(); - at::Tensor offset = offset_param.contiguous(); - at::Tensor mask = mask_param.contiguous(); - at::Tensor bias = bias_param.contiguous(); - - const int batch_sz = input.size(0); + at::Tensor grad_out_c = grad_out.contiguous(); + at::Tensor input_c = input.contiguous(); + at::Tensor weight_c = weight.contiguous(); + at::Tensor offset_c = offset.contiguous(); + at::Tensor mask_c = mask.contiguous(); + at::Tensor bias_c = bias.contiguous(); + + const int batch_sz = input_c.size(0); const int n_parallel_imgs = get_greatest_divisor_below_bound(batch_sz, kMaxParallelImgs); auto grad_input_and_offset_and_mask = backward_gradient_inputs( - input, - weight, - offset, - mask, - grad_out, + input_c, + weight_c, + offset_c, + mask_c, + grad_out_c, stride_h, stride_w, pad_h, pad_w, - dil_h, - dil_w, + dilation_h, + dilation_w, n_weight_grps, n_offset_grps, n_parallel_imgs, @@ -1105,23 +1116,23 @@ deform_conv2d_backward_cpu( auto grad_mask = std::get<2>(grad_input_and_offset_and_mask); auto grad_weight = backward_gradient_parameters( - input, - weight, - offset, - mask, - grad_out, + input_c, + weight_c, + offset_c, + mask_c, + grad_out_c, stride_h, stride_w, pad_h, pad_w, - dil_h, - dil_w, + dilation_h, + dilation_w, n_weight_grps, n_offset_grps, n_parallel_imgs, use_mask); - auto grad_bias = at::ones_like(bias) * grad_out.sum({0, 2, 3}); + auto grad_bias = at::ones_like(bias_c) * grad_out_c.sum({0, 2, 3}); return std::make_tuple( grad_input, grad_weight, grad_offset, grad_mask, grad_bias); diff --git a/torchvision/csrc/cpu/deform_conv2d_kernel.h b/torchvision/csrc/cpu/deform_conv2d_kernel.h index 2eb9375ca1a..2eb5ab37c6e 100644 --- a/torchvision/csrc/cpu/deform_conv2d_kernel.h +++ b/torchvision/csrc/cpu/deform_conv2d_kernel.h @@ -4,17 +4,17 @@ #include "../macros.h" VISION_API at::Tensor deform_conv2d_forward_cpu( - const at::Tensor& input_param, - const at::Tensor& weight_param, - const at::Tensor& offset_param, - const at::Tensor& mask_param, - const at::Tensor& bias_param, + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& offset, + const at::Tensor& mask, + const at::Tensor& bias, int64_t stride_h, int64_t stride_w, int64_t pad_h, int64_t pad_w, - int64_t dil_h, - int64_t dil_w, + int64_t dilation_h, + int64_t dilation_w, int64_t n_weight_grps, int64_t n_offset_grps, bool use_mask); @@ -22,18 +22,18 @@ VISION_API at::Tensor deform_conv2d_forward_cpu( VISION_API std:: tuple deform_conv2d_backward_cpu( - const at::Tensor& grad_out_param, - const at::Tensor& input_param, - const at::Tensor& weight_param, - const at::Tensor& offset_param, - const at::Tensor& mask_param, - const at::Tensor& bias_param, + const at::Tensor& grad_out, + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& offset, + const at::Tensor& mask, + const at::Tensor& bias, int64_t stride_h, int64_t stride_w, int64_t pad_h, int64_t pad_w, - int64_t dil_h, - int64_t dil_w, + int64_t dilation_h, + int64_t dilation_w, int64_t n_weight_grps, int64_t n_offset_grps, bool use_mask); diff --git 
a/torchvision/csrc/cuda/deform_conv2d_kernel.cu b/torchvision/csrc/cuda/deform_conv2d_kernel.cu index 76a83bc82c5..6edaa9c73af 100644 --- a/torchvision/csrc/cuda/deform_conv2d_kernel.cu +++ b/torchvision/csrc/cuda/deform_conv2d_kernel.cu @@ -147,8 +147,8 @@ __global__ void deformable_im2col_kernel( int pad_w, int stride_h, int stride_w, - int dil_h, - int dil_w, + int dilation_h, + int dilation_w, int batch_sz, int n_in_channels, int n_offset_grps, @@ -196,8 +196,10 @@ __global__ void deformable_im2col_kernel( offset_ptr[offset_idx * (out_h * out_w) + out_y * out_w + out_x]; const scalar_t offset_w = offset_ptr [(offset_idx + 1) * (out_h * out_w) + out_y * out_w + out_x]; - const scalar_t y = (out_y * stride_h - pad_h) + i * dil_h + offset_h; - const scalar_t x = (out_x * stride_w - pad_w) + j * dil_w + offset_w; + const scalar_t y = + (out_y * stride_h - pad_h) + i * dilation_h + offset_h; + const scalar_t x = + (out_x * stride_w - pad_w) + j * dilation_w + offset_w; *columns_ptr = mask_value * bilinear_interpolate(input_ptr, height, width, y, x); columns_ptr += batch_sz * out_h * out_w; @@ -219,8 +221,8 @@ void deformable_im2col( int pad_w, int stride_h, int stride_w, - int dil_h, - int dil_w, + int dilation_h, + int dilation_w, int out_h, int out_w, int parallel_imgs, @@ -249,8 +251,8 @@ void deformable_im2col( pad_w, stride_h, stride_w, - dil_h, - dil_w, + dilation_h, + dilation_w, parallel_imgs, n_in_channels, deformable_group, @@ -634,8 +636,8 @@ std::tuple backward_gradient_inputs( int stride_w, int pad_h, int pad_w, - int dil_h, - int dil_w, + int dilation_h, + int dilation_w, int n_weight_grps, int n_offset_grps, int n_parallel_imgs, @@ -653,8 +655,10 @@ std::tuple backward_gradient_inputs( int weight_h = weight.size(2); int weight_w = weight.size(3); - long out_w = (in_w + 2 * pad_w - (dil_w * (weight_w - 1) + 1)) / stride_w + 1; - long out_h = (in_h + 2 * pad_h - (dil_h * (weight_h - 1) + 1)) / stride_h + 1; + long out_w = + (in_w + 2 * pad_w - (dilation_w * (weight_w - 1) + 1)) / stride_w + 1; + long out_h = + (in_h + 2 * pad_h - (dilation_h * (weight_h - 1) + 1)) / stride_h + 1; auto grad_input = at::zeros_like(input); auto grad_offset = at::zeros_like(offset); @@ -737,8 +741,8 @@ std::tuple backward_gradient_inputs( pad_w, stride_h, stride_w, - dil_h, - dil_w, + dilation_h, + dilation_w, n_parallel_imgs, n_offset_grps, use_mask, @@ -758,8 +762,8 @@ std::tuple backward_gradient_inputs( pad_w, stride_h, stride_w, - dil_h, - dil_w, + dilation_h, + dilation_w, n_parallel_imgs, n_offset_grps, use_mask, @@ -788,8 +792,8 @@ at::Tensor backward_gradient_parameters( int stride_w, int pad_h, int pad_w, - int dil_h, - int dil_w, + int dilation_h, + int dilation_w, int n_weight_grps, int n_offset_grps, int n_parallel_imgs, @@ -868,8 +872,8 @@ at::Tensor backward_gradient_parameters( pad_w, stride_h, stride_w, - dil_h, - dil_w, + dilation_h, + dilation_w, out_h, out_w, n_parallel_imgs, @@ -897,48 +901,48 @@ at::Tensor backward_gradient_parameters( } // namespace at::Tensor deform_conv2d_forward_cuda( - const at::Tensor& input_param, - const at::Tensor& weight_param, - const at::Tensor& offset_param, - const at::Tensor& mask_param, - const at::Tensor& bias_param, + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& offset, + const at::Tensor& mask, + const at::Tensor& bias, int64_t stride_h, int64_t stride_w, int64_t pad_h, int64_t pad_w, - int64_t dil_h, - int64_t dil_w, + int64_t dilation_h, + int64_t dilation_w, int64_t n_weight_grps, int64_t n_offset_grps, bool 
use_mask) { - at::Tensor input = input_param.contiguous(); - at::Tensor offset = offset_param.contiguous(); - at::Tensor weight = weight_param.contiguous(); - at::Tensor mask = mask_param.contiguous(); - at::Tensor bias = bias_param.contiguous(); - - TORCH_CHECK(input.ndimension() == 4); - TORCH_CHECK(offset.ndimension() == 4); - TORCH_CHECK(!use_mask || mask.ndimension() == 4); - TORCH_CHECK(weight.ndimension() == 4); - TORCH_CHECK(input.is_cuda(), "input must be a CUDA tensor"); + at::Tensor input_c = input.contiguous(); + at::Tensor offset_c = offset.contiguous(); + at::Tensor weight_c = weight.contiguous(); + at::Tensor mask_c = mask.contiguous(); + at::Tensor bias_c = bias.contiguous(); - at::DeviceGuard guard(input.device()); + TORCH_CHECK(input_c.ndimension() == 4); + TORCH_CHECK(offset_c.ndimension() == 4); + TORCH_CHECK(!use_mask || mask_c.ndimension() == 4); + TORCH_CHECK(weight_c.ndimension() == 4); + TORCH_CHECK(input_c.is_cuda(), "input must be a CUDA tensor"); - int batch_sz = input.size(0); - int in_channels = input.size(1); - int in_h = input.size(2); - int in_w = input.size(3); + at::DeviceGuard guard(input_c.device()); + + int batch_sz = input_c.size(0); + int in_channels = input_c.size(1); + int in_h = input_c.size(2); + int in_w = input_c.size(3); int n_parallel_imgs = get_greatest_divisor_below_bound(batch_sz, kMaxParallelImgs); - int out_channels = weight.size(0); - int weight_h = weight.size(2); - int weight_w = weight.size(3); + int out_channels = weight_c.size(0); + int weight_h = weight_c.size(2); + int weight_w = weight_c.size(3); - int ker_h = dil_h * (weight_h - 1) + 1; - int ker_w = dil_w * (weight_w - 1) + 1; + int ker_h = dilation_h * (weight_h - 1) + 1; + int ker_w = dilation_w * (weight_w - 1) + 1; int out_h = ((in_h + 2 * pad_h - ker_h) / stride_h) + 1; int out_w = ((in_w + 2 * pad_w - ker_w) / stride_w) + 1; @@ -955,45 +959,51 @@ at::Tensor deform_conv2d_forward_cuda( " stride_w: ", stride_w); TORCH_CHECK(pad_h >= 0 && pad_w >= 0, "pad_h: ", pad_h, " pad_w: ", pad_w); - TORCH_CHECK(dil_h > 0 && dil_w > 0, "dil_h: ", dil_h, " dil_w: ", dil_w); - - TORCH_CHECK(weight.size(1) * n_weight_grps == input.size(1)); - TORCH_CHECK(weight.size(0) % n_weight_grps == 0); TORCH_CHECK( - (offset.size(1) == n_offset_grps * 2 * weight_h * weight_w), + dilation_h > 0 && dilation_w > 0, + "dilation_h: ", + dilation_h, + " dilation_w: ", + dilation_w); + + TORCH_CHECK(weight_c.size(1) * n_weight_grps == input_c.size(1)); + TORCH_CHECK(weight_c.size(0) % n_weight_grps == 0); + TORCH_CHECK( + (offset_c.size(1) == n_offset_grps * 2 * weight_h * weight_w), "offset.shape[1] is not valid: got: ", - offset.size(1), + offset_c.size(1), " expected: ", n_offset_grps * 2 * weight_h * weight_w); TORCH_CHECK( - (!use_mask || mask.size(1) == n_offset_grps * weight_h * weight_w), + (!use_mask || mask_c.size(1) == n_offset_grps * weight_h * weight_w), "mask.shape[1] is not valid: got: ", - mask.size(1), + mask_c.size(1), " expected: ", n_offset_grps * weight_h * weight_w); - TORCH_CHECK(input.size(1) % n_offset_grps == 0); + TORCH_CHECK(input_c.size(1) % n_offset_grps == 0); TORCH_CHECK( - (offset.size(0) == input.size(0)), "invalid batch size of offset"); + (offset_c.size(0) == input_c.size(0)), "invalid batch size of offset"); TORCH_CHECK( - (offset.size(2) == out_h && offset.size(3) == out_w), + (offset_c.size(2) == out_h && offset_c.size(3) == out_w), "offset output dims: (", - offset.size(2), + offset_c.size(2), ", ", - offset.size(3), + offset_c.size(3), ") - ", "computed output 
dims: (", out_h, ", ", out_w, ")"); - TORCH_CHECK((mask.size(0) == input.size(0)), "invalid batch size of mask"); TORCH_CHECK( - (!use_mask || (mask.size(2) == out_h && mask.size(3) == out_w)), + (mask_c.size(0) == input_c.size(0)), "invalid batch size of mask"); + TORCH_CHECK( + (!use_mask || (mask_c.size(2) == out_h && mask_c.size(3) == out_w)), "mask output dims: (", - mask.size(2), + mask_c.size(2), ", ", - mask.size(3), + mask_c.size(3), ") - ", "computed output dims: (", out_h, @@ -1007,7 +1017,8 @@ at::Tensor deform_conv2d_forward_cuda( " out_w: ", out_w); - auto out = at::zeros({batch_sz, out_channels, out_h, out_w}, input.options()); + auto out = + at::zeros({batch_sz, out_channels, out_h, out_w}, input_c.options()); if (batch_sz == 0) { return out; } @@ -1018,21 +1029,21 @@ at::Tensor deform_conv2d_forward_cuda( out_channels, out_h, out_w}); - input = input.view( + input_c = input_c.view( {batch_sz / n_parallel_imgs, n_parallel_imgs, in_channels, in_h, in_w}); - offset = offset.view({batch_sz / n_parallel_imgs, - n_parallel_imgs, - n_offset_grps * 2 * weight_h * weight_w, - out_h, - out_w}); + offset_c = offset_c.view({batch_sz / n_parallel_imgs, + n_parallel_imgs, + n_offset_grps * 2 * weight_h * weight_w, + out_h, + out_w}); if (use_mask) { - mask = mask.view({batch_sz / n_parallel_imgs, - n_parallel_imgs, - n_offset_grps * weight_h * weight_w, - out_h, - out_w}); + mask_c = mask_c.view({batch_sz / n_parallel_imgs, + n_parallel_imgs, + n_offset_grps * weight_h * weight_w, + out_h, + out_w}); } at::Tensor out_buf = at::zeros( @@ -1048,21 +1059,21 @@ at::Tensor deform_conv2d_forward_cuda( out_buf.size(1) / n_weight_grps, out_buf.size(2), out_buf.size(3)}); - weight = weight.view({n_weight_grps, - weight.size(0) / n_weight_grps, - weight.size(1), - weight.size(2), - weight.size(3)}); + weight_c = weight_c.view({n_weight_grps, + weight_c.size(0) / n_weight_grps, + weight_c.size(1), + weight_c.size(2), + weight_c.size(3)}); // Sample points and perform convolution auto columns = at::zeros( {in_channels * weight_h * weight_w, n_parallel_imgs * out_h * out_w}, - input.options()); + input_c.options()); for (int b = 0; b < batch_sz / n_parallel_imgs; b++) { deformable_im2col( - input[b], - offset[b], - mask[b], + input_c[b], + offset_c[b], + mask_c[b], in_channels, in_h, in_w, @@ -1072,8 +1083,8 @@ at::Tensor deform_conv2d_forward_cuda( pad_w, stride_h, stride_w, - dil_h, - dil_w, + dilation_h, + dilation_w, out_h, out_w, n_parallel_imgs, @@ -1086,7 +1097,7 @@ at::Tensor deform_conv2d_forward_cuda( for (int g = 0; g < n_weight_grps; g++) { out_buf[b][g] = out_buf[b][g] .flatten(1) - .addmm_(weight[g].flatten(1), columns[g]) + .addmm_(weight_c[g].flatten(1), columns[g]) .view_as(out_buf[b][g]); } columns = @@ -1102,49 +1113,49 @@ at::Tensor deform_conv2d_forward_cuda( out.copy_(out_buf); out = out.view({batch_sz, out_channels, out_h, out_w}); - return out + bias.view({1, out_channels, 1, 1}); + return out + bias_c.view({1, out_channels, 1, 1}); } std::tuple deform_conv2d_backward_cuda( - const at::Tensor& grad_out_param, - const at::Tensor& input_param, - const at::Tensor& weight_param, - const at::Tensor& offset_param, - const at::Tensor& mask_param, - const at::Tensor& bias_param, + const at::Tensor& grad_out, + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& offset, + const at::Tensor& mask, + const at::Tensor& bias, int64_t stride_h, int64_t stride_w, int64_t pad_h, int64_t pad_w, - int64_t dil_h, - int64_t dil_w, + int64_t dilation_h, + int64_t dilation_w, 
int64_t n_weight_grps, int64_t n_offset_grps, bool use_mask) { - at::Tensor grad_out = grad_out_param.contiguous(); - at::Tensor input = input_param.contiguous(); - at::Tensor weight = weight_param.contiguous(); - at::Tensor offset = offset_param.contiguous(); - at::Tensor mask = mask_param.contiguous(); - at::Tensor bias = bias_param.contiguous(); - - const int batch_sz = input.size(0); + at::Tensor grad_out_c = grad_out.contiguous(); + at::Tensor input_c = input.contiguous(); + at::Tensor weight_c = weight.contiguous(); + at::Tensor offset_c = offset.contiguous(); + at::Tensor mask_c = mask.contiguous(); + at::Tensor bias_c = bias.contiguous(); + + const int batch_sz = input_c.size(0); const int n_parallel_imgs = get_greatest_divisor_below_bound(batch_sz, kMaxParallelImgs); auto grad_input_and_offset_and_mask = backward_gradient_inputs( - input, - weight, - offset, - mask, - grad_out, + input_c, + weight_c, + offset_c, + mask_c, + grad_out_c, stride_h, stride_w, pad_h, pad_w, - dil_h, - dil_w, + dilation_h, + dilation_w, n_weight_grps, n_offset_grps, n_parallel_imgs, @@ -1155,24 +1166,24 @@ deform_conv2d_backward_cuda( auto grad_mask = std::get<2>(grad_input_and_offset_and_mask); auto grad_weight = backward_gradient_parameters( - input, - weight, - offset, - mask, - grad_out, + input_c, + weight_c, + offset_c, + mask_c, + grad_out_c, stride_h, stride_w, pad_h, pad_w, - dil_h, - dil_w, + dilation_h, + dilation_w, n_weight_grps, n_offset_grps, n_parallel_imgs, use_mask); - auto value = grad_out.sum({0, 2, 3}); - auto grad_bias = at::ones_like(bias) * value; + auto value = grad_out_c.sum({0, 2, 3}); + auto grad_bias = at::ones_like(bias_c) * value; return std::make_tuple( grad_input, grad_weight, grad_offset, grad_mask, grad_bias); diff --git a/torchvision/csrc/cuda/deform_conv2d_kernel.h b/torchvision/csrc/cuda/deform_conv2d_kernel.h index c9afe68849d..00f3f3dc15d 100644 --- a/torchvision/csrc/cuda/deform_conv2d_kernel.h +++ b/torchvision/csrc/cuda/deform_conv2d_kernel.h @@ -4,17 +4,17 @@ #include "../macros.h" VISION_API at::Tensor deform_conv2d_forward_cuda( - const at::Tensor& input_param, - const at::Tensor& weight_param, - const at::Tensor& offset_param, - const at::Tensor& mask_param, - const at::Tensor& bias_param, + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& offset, + const at::Tensor& mask, + const at::Tensor& bias, int64_t stride_h, int64_t stride_w, int64_t pad_h, int64_t pad_w, - int64_t dil_h, - int64_t dil_w, + int64_t dilation_h, + int64_t dilation_w, int64_t n_weight_grps, int64_t n_offset_grps, bool use_mask); @@ -22,18 +22,18 @@ VISION_API at::Tensor deform_conv2d_forward_cuda( VISION_API std:: tuple deform_conv2d_backward_cuda( - const at::Tensor& grad_out_param, - const at::Tensor& input_param, - const at::Tensor& weight_param, - const at::Tensor& offset_param, - const at::Tensor& mask_param, - const at::Tensor& bias_param, + const at::Tensor& grad_out, + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& offset, + const at::Tensor& mask, + const at::Tensor& bias, int64_t stride_h, int64_t stride_w, int64_t pad_h, int64_t pad_w, - int64_t dil_h, - int64_t dil_w, + int64_t dilation_h, + int64_t dilation_w, int64_t n_weight_grps, int64_t n_offset_grps, bool use_mask); From da80ce16e72a5f60d7c1d06ec9e0e042aa993a3b Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Mon, 30 Nov 2020 17:20:18 +0000 Subject: [PATCH 10/11] Exposing public forward/backward methods to the C++ API and moving methods around to minimize git 
blame changes. --- torchvision/csrc/deform_conv2d.cpp | 76 +++++++++++++++--------------- torchvision/csrc/deform_conv2d.h | 40 +++++++++++++++- 2 files changed, 76 insertions(+), 40 deletions(-) diff --git a/torchvision/csrc/deform_conv2d.cpp b/torchvision/csrc/deform_conv2d.cpp index 062a6a3e9d9..74ba630537a 100644 --- a/torchvision/csrc/deform_conv2d.cpp +++ b/torchvision/csrc/deform_conv2d.cpp @@ -5,8 +5,6 @@ #include #endif -namespace { - at::Tensor deform_conv2d( const at::Tensor& input, const at::Tensor& weight, @@ -42,6 +40,42 @@ at::Tensor deform_conv2d( use_mask); } +#if defined(WITH_CUDA) || defined(WITH_HIP) +at::Tensor deform_conv2d_autocast( + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& offset, + const at::Tensor& mask, + const at::Tensor& bias, + int64_t stride_h, + int64_t stride_w, + int64_t pad_h, + int64_t pad_w, + int64_t dilation_h, + int64_t dilation_w, + int64_t groups, + int64_t offset_groups, + bool use_mask) { + c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::Autocast); + return deform_conv2d( + at::autocast::cached_cast(at::kFloat, input), + at::autocast::cached_cast(at::kFloat, weight), + at::autocast::cached_cast(at::kFloat, offset), + at::autocast::cached_cast(at::kFloat, mask), + at::autocast::cached_cast(at::kFloat, bias), + stride_h, + stride_w, + pad_h, + pad_w, + dilation_h, + dilation_w, + groups, + offset_groups, + use_mask) + .to(input.scalar_type()); +} +#endif + std::tuple _deform_conv2d_backward( const at::Tensor& grad, @@ -81,6 +115,8 @@ _deform_conv2d_backward( use_mask); } +namespace { + class DeformConv2dFunction : public torch::autograd::Function { public: @@ -257,42 +293,6 @@ class DeformConv2dBackwardFunction } // namespace -#if defined(WITH_CUDA) || defined(WITH_HIP) -at::Tensor deform_conv2d_autocast( - const at::Tensor& input, - const at::Tensor& weight, - const at::Tensor& offset, - const at::Tensor& mask, - const at::Tensor& bias, - int64_t stride_h, - int64_t stride_w, - int64_t pad_h, - int64_t pad_w, - int64_t dilation_h, - int64_t dilation_w, - int64_t groups, - int64_t offset_groups, - bool use_mask) { - c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::Autocast); - return deform_conv2d( - at::autocast::cached_cast(at::kFloat, input), - at::autocast::cached_cast(at::kFloat, weight), - at::autocast::cached_cast(at::kFloat, offset), - at::autocast::cached_cast(at::kFloat, mask), - at::autocast::cached_cast(at::kFloat, bias), - stride_h, - stride_w, - pad_h, - pad_w, - dilation_h, - dilation_w, - groups, - offset_groups, - use_mask) - .to(input.scalar_type()); -} -#endif - at::Tensor deform_conv2d_autograd( const at::Tensor& input, const at::Tensor& weight, diff --git a/torchvision/csrc/deform_conv2d.h b/torchvision/csrc/deform_conv2d.h index ceebae52d68..6adc77fb888 100644 --- a/torchvision/csrc/deform_conv2d.h +++ b/torchvision/csrc/deform_conv2d.h @@ -9,7 +9,24 @@ #include "hip/deform_conv2d_kernel.h" #endif -// Autocast Registration +// C++ Forward +at::Tensor deform_conv2d( + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& offset, + const at::Tensor& mask, + const at::Tensor& bias, + int64_t stride_h, + int64_t stride_w, + int64_t pad_h, + int64_t pad_w, + int64_t dilation_h, + int64_t dilation_w, + int64_t groups, + int64_t offset_groups, + bool use_mask); + +// Autocast Forward #if defined(WITH_CUDA) || defined(WITH_HIP) at::Tensor deform_conv2d_autocast( const at::Tensor& input, @@ -28,7 +45,26 @@ at::Tensor deform_conv2d_autocast( bool 
use_mask); #endif -// Autograd Registration +// C++ Backward +std::tuple +_deform_conv2d_backward( + const at::Tensor& grad, + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& offset, + const at::Tensor& mask, + const at::Tensor& bias, + int64_t stride_h, + int64_t stride_w, + int64_t pad_h, + int64_t pad_w, + int64_t dilation_h, + int64_t dilation_w, + int64_t groups, + int64_t offset_groups, + bool use_mask); + +// Autograd Forward and Backward at::Tensor deform_conv2d_autograd( const at::Tensor& input, const at::Tensor& weight, From df0e8a7a70742b1697785fb9a5ac4bbf51249962 Mon Sep 17 00:00:00 2001 From: Vasilis Vryniotis Date: Mon, 30 Nov 2020 21:32:52 +0000 Subject: [PATCH 11/11] Registering operators in their files. --- torchvision/csrc/cpu/deform_conv2d_kernel.cpp | 7 +++++++ torchvision/csrc/cuda/deform_conv2d_kernel.cu | 6 ++++++ torchvision/csrc/deform_conv2d.cpp | 9 +++++++++ torchvision/csrc/vision.cpp | 7 ------- 4 files changed, 22 insertions(+), 7 deletions(-) diff --git a/torchvision/csrc/cpu/deform_conv2d_kernel.cpp b/torchvision/csrc/cpu/deform_conv2d_kernel.cpp index f593e880b3b..a3bd26c82a3 100644 --- a/torchvision/csrc/cpu/deform_conv2d_kernel.cpp +++ b/torchvision/csrc/cpu/deform_conv2d_kernel.cpp @@ -66,6 +66,8 @@ // modified from // https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/dcn/src/deform_conv_cuda.cpp +#include + #include "deform_conv2d_kernel.h" namespace { @@ -1137,3 +1139,8 @@ deform_conv2d_backward_cpu( return std::make_tuple( grad_input, grad_weight, grad_offset, grad_mask, grad_bias); } + +TORCH_LIBRARY_IMPL(torchvision, CPU, m) { + m.impl("deform_conv2d", deform_conv2d_forward_cpu); + m.impl("_deform_conv2d_backward", deform_conv2d_backward_cpu); +} diff --git a/torchvision/csrc/cuda/deform_conv2d_kernel.cu b/torchvision/csrc/cuda/deform_conv2d_kernel.cu index 6edaa9c73af..8e2ae9032b1 100644 --- a/torchvision/csrc/cuda/deform_conv2d_kernel.cu +++ b/torchvision/csrc/cuda/deform_conv2d_kernel.cu @@ -70,6 +70,7 @@ #include #include #include +#include #include "cuda_helpers.h" #include "deform_conv2d_kernel.h" @@ -1188,3 +1189,8 @@ deform_conv2d_backward_cuda( return std::make_tuple( grad_input, grad_weight, grad_offset, grad_mask, grad_bias); } + +TORCH_LIBRARY_IMPL(torchvision, CUDA, m) { + m.impl("deform_conv2d", deform_conv2d_forward_cuda); + m.impl("_deform_conv2d_backward", deform_conv2d_backward_cuda); +} diff --git a/torchvision/csrc/deform_conv2d.cpp b/torchvision/csrc/deform_conv2d.cpp index 74ba630537a..66b91986c2c 100644 --- a/torchvision/csrc/deform_conv2d.cpp +++ b/torchvision/csrc/deform_conv2d.cpp @@ -74,6 +74,10 @@ at::Tensor deform_conv2d_autocast( use_mask) .to(input.scalar_type()); } + +TORCH_LIBRARY_IMPL(torchvision, Autocast, m) { + m.impl("deform_conv2d", deform_conv2d_autocast); +} #endif std::tuple @@ -361,3 +365,8 @@ deform_conv2d_backward_autograd( return std::make_tuple(result[0], result[1], result[2], result[3], result[4]); } + +TORCH_LIBRARY_IMPL(torchvision, Autograd, m) { + m.impl("deform_conv2d", deform_conv2d_autograd); + m.impl("_deform_conv2d_backward", deform_conv2d_backward_autograd); +} diff --git a/torchvision/csrc/vision.cpp b/torchvision/csrc/vision.cpp index 2d4e2af0f53..a7517e5603b 100644 --- a/torchvision/csrc/vision.cpp +++ b/torchvision/csrc/vision.cpp @@ -62,8 +62,6 @@ TORCH_LIBRARY(torchvision, m) { } TORCH_LIBRARY_IMPL(torchvision, CPU, m) { - m.impl("deform_conv2d", deform_conv2d_forward_cpu); - m.impl("_deform_conv2d_backward", deform_conv2d_backward_cpu); 
m.impl("nms", nms_cpu); m.impl("ps_roi_align", PSROIAlign_forward_cpu); m.impl("_ps_roi_align_backward", PSROIAlign_backward_cpu); @@ -78,8 +76,6 @@ TORCH_LIBRARY_IMPL(torchvision, CPU, m) { // TODO: Place this in a hypothetical separate torchvision_cuda library #if defined(WITH_CUDA) || defined(WITH_HIP) TORCH_LIBRARY_IMPL(torchvision, CUDA, m) { - m.impl("deform_conv2d", deform_conv2d_forward_cuda); - m.impl("_deform_conv2d_backward", deform_conv2d_backward_cuda); m.impl("nms", nms_cuda); m.impl("ps_roi_align", PSROIAlign_forward_cuda); m.impl("_ps_roi_align_backward", PSROIAlign_backward_cuda); @@ -95,7 +91,6 @@ TORCH_LIBRARY_IMPL(torchvision, CUDA, m) { // Autocast only needs to wrap forward pass ops. #if defined(WITH_CUDA) || defined(WITH_HIP) TORCH_LIBRARY_IMPL(torchvision, Autocast, m) { - m.impl("deform_conv2d", deform_conv2d_autocast); m.impl("nms", nms_autocast); m.impl("ps_roi_align", PSROIAlign_autocast); m.impl("ps_roi_pool", PSROIPool_autocast); @@ -105,8 +100,6 @@ TORCH_LIBRARY_IMPL(torchvision, Autocast, m) { #endif TORCH_LIBRARY_IMPL(torchvision, Autograd, m) { - m.impl("deform_conv2d", deform_conv2d_autograd); - m.impl("_deform_conv2d_backward", deform_conv2d_backward_autograd); m.impl("ps_roi_align", PSROIAlign_autograd); m.impl("_ps_roi_align_backward", PSROIAlign_backward_autograd); m.impl("ps_roi_pool", PSROIPool_autograd);