Adds normalization options for fft and ifft #456

Merged: 6 commits, Jul 27, 2023
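This PR adds an FFTNorm argument to the fft()/ifft() operators and the fft_impl()/ifft_impl() transforms. Below is a minimal usage sketch of the new API, not code from the PR; tensor names and sizes are illustrative, and FFTNorm::BACKWARD remains the default, so existing call sites behave as before.

#include "matx.h"

using namespace matx;

int main() {
  cudaStream_t stream = 0;

  auto x = make_tensor<cuda::std::complex<float>>({1024});
  auto X = make_tensor<cuda::std::complex<float>>({1024});

  // Default (BACKWARD): forward FFT is unscaled, inverse applies 1/N
  (X = fft(x)).run(stream);

  // ORTHO: both directions scale by 1/sqrt(N), so fft followed by ifft
  // reproduces x without any manual rescaling
  (X = fft(x, 0, FFTNorm::ORTHO)).run(stream);
  (x = ifft(X, 0, FFTNorm::ORTHO)).run(stream);

  cudaStreamSynchronize(stream);
  return 0;
}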
examples/fft_conv.cu (3 additions, 2 deletions)

@@ -120,13 +120,14 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
if (i == 1) {
cudaEventRecord(start, stream);
}
- fft_impl(sig_freq, sig_time, 0, stream);
- fft_impl(filt_freq, filt_time, 0, stream);
+ (sig_freq = fft(sig_time)).run(stream);
+ (filt_freq = fft(filt_time)).run(stream);

(sig_freq = sig_freq * filt_freq).run(stream);

// IFFT in-place
(sig_freq = ifft(sig_freq)).run(stream);

}

cudaEventRecord(stop, stream);
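The example above switches from direct fft_impl() calls to the operator form. For reference, a condensed sketch of the frequency-domain convolution pattern it implements (names as in the example; comments are editorial):

// Circular convolution via the frequency domain: ifft(fft(sig) * fft(filt)).
// With the default FFTNorm::BACKWARD, the forward transforms are unscaled and
// the inverse applies 1/N, matching the usual convolution identity.
(sig_freq = fft(sig_time)).run(stream);
(filt_freq = fft(filt_time)).run(stream);
(sig_freq = sig_freq * filt_freq).run(stream);
(sig_freq = ifft(sig_freq)).run(stream); // in-place IFFT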
include/matx/core/type_utils.h (1 addition, 1 deletion)

@@ -669,7 +669,7 @@ using extract_scalar_type_t = typename detail::extract_scalar_type_impl<T>::scal
* @tparam T Type to convert
*/
template <typename T>
- using promote_half_t = typename std::conditional_t<is_half_v<T>, float, T>;
+ using promote_half_t = typename std::conditional_t<is_half_v<T> || is_matx_half_v<T>, float, T>;



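promote_half_t previously promoted only the CUDA half types matched by is_half_v; it now also promotes MatX's own half types via is_matx_half_v. A sketch of the intended behavior, assuming matxFp16 is one of the types is_matx_half_v matches:

static_assert(std::is_same_v<matx::promote_half_t<__half>, float>);         // already promoted
static_assert(std::is_same_v<matx::promote_half_t<matx::matxFp16>, float>); // newly promoted
static_assert(std::is_same_v<matx::promote_half_t<float>, float>);          // non-half passes through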
include/matx/operators/dct.h (2 additions, 1 deletion)

@@ -39,6 +39,7 @@
#include "matx/core/error.h"
#include "matx/core/tensor.h"
#include "matx/core/type_utils.h"
#include "matx/transforms/fft.h"

namespace matx {

@@ -102,7 +103,7 @@ void dct(OutputTensor &out, const InputTensor &in,

tensor_t<cuda::std::complex<typename OutputTensor::scalar_type>, 1> tmp{{N + 1}};

- fft_impl(tmp, in, 0, stream);
+ fft_impl(tmp, in, 0, FFTNorm::BACKWARD, stream);
auto s = tmp.Slice({0}, {N});
detail::dctOp(out, s, N).run(stream);
}
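Internal call sites such as this one in dct() pass FFTNorm::BACKWARD explicitly, so transforms built on fft_impl keep their pre-PR scaling behavior.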
include/matx/operators/fft.h (23 additions, 13 deletions)

@@ -48,6 +48,7 @@ namespace matx
uint64_t fft_size_;
PermDims perm_;
FFTType type_;
+ FFTNorm norm_;
std::array<index_t, OpA::Rank()> out_dims_;
matx::tensor_t<std::conditional_t<is_complex_v<typename OpA::scalar_type>,
typename OpA::scalar_type,
@@ -68,7 +69,8 @@
}
}

- __MATX_INLINE__ FFTOp(OpA a, uint64_t size, PermDims perm, FFTType t) : a_(a), fft_size_(size), perm_(perm), type_(t) {
+ __MATX_INLINE__ FFTOp(OpA a, uint64_t size, PermDims perm, FFTType t, FFTNorm norm) :
+     a_(a), fft_size_(size), perm_(perm), type_(t), norm_(norm) {
for (int r = 0; r < Rank(); r++) {
out_dims_[r] = a_.Size(r);
}
@@ -113,18 +115,18 @@
static_assert(is_device_executor_v<Executor>, "fft()/ifft() only supports the CUDA executor currently");
if constexpr (std::is_same_v<PermDims, no_permute_t>) {
if constexpr (std::is_same_v<FFTType, fft_t>) {
- fft_impl(std::get<0>(out), a_, fft_size_, ex.getStream());
+ fft_impl(std::get<0>(out), a_, fft_size_, norm_, ex.getStream());
}
else {
- ifft_impl(std::get<0>(out), a_, fft_size_, ex.getStream());
+ ifft_impl(std::get<0>(out), a_, fft_size_, norm_, ex.getStream());
}
}
else {
if constexpr (std::is_same_v<FFTType, fft_t>) {
- fft_impl(permute(std::get<0>(out), perm_), permute(a_, perm_), fft_size_, ex.getStream());
+ fft_impl(permute(std::get<0>(out), perm_), permute(a_, perm_), fft_size_, norm_, ex.getStream());
}
else {
- ifft_impl(permute(std::get<0>(out), perm_), permute(a_, perm_), fft_size_, ex.getStream());
+ ifft_impl(permute(std::get<0>(out), perm_), permute(a_, perm_), fft_size_, norm_, ex.getStream());
}
}
}
@@ -161,10 +163,12 @@
* input tensor or operator
* @param fft_size
* Size of FFT. Setting to 0 uses the output size to figure out the FFT size.
+ * @param norm
+ *   Normalization to apply to FFT
*/
template<typename OpA>
- __MATX_INLINE__ auto fft(OpA &&a, uint64_t fft_size = 0) {
-   return detail::FFTOp(a, fft_size, detail::no_permute_t{}, detail::fft_t{});
+ __MATX_INLINE__ auto fft(OpA &&a, uint64_t fft_size = 0, FFTNorm norm = FFTNorm::BACKWARD) {
+   return detail::FFTOp(a, fft_size, detail::no_permute_t{}, detail::fft_t{}, norm);
}

/**
@@ -184,12 +188,14 @@
* axis to perform FFT along
* @param fft_size
* Size of FFT. Setting to 0 uses the output size to figure out the FFT size.
+ * @param norm
+ *   Normalization to apply to FFT
*/
template<typename OpA>
- __MATX_INLINE__ auto fft(OpA &&a, const int32_t (&axis)[1], uint64_t fft_size = 0) {
+ __MATX_INLINE__ auto fft(OpA &&a, const int32_t (&axis)[1], uint64_t fft_size = 0, FFTNorm norm = FFTNorm::BACKWARD) {

auto perm = detail::getPermuteDims<remove_cvref_t<OpA>::Rank()>(axis);
- return detail::FFTOp(a, fft_size, perm, detail::fft_t{});
+ return detail::FFTOp(a, fft_size, perm, detail::fft_t{}, norm);
}

/**
Expand All @@ -208,10 +214,12 @@ namespace matx
* input tensor or operator
* @param fft_size
* Size of FFT. Setting to 0 uses the output size to figure out the FFT size.
+ * @param norm
+ *   Normalization to apply to FFT
*/
template<typename OpA>
- __MATX_INLINE__ auto ifft(OpA &&a, uint64_t fft_size = 0) {
-   return detail::FFTOp(a, fft_size, detail::no_permute_t{}, detail::ifft_t{});
+ __MATX_INLINE__ auto ifft(OpA &&a, uint64_t fft_size = 0, FFTNorm norm = FFTNorm::BACKWARD) {
+   return detail::FFTOp(a, fft_size, detail::no_permute_t{}, detail::ifft_t{}, norm);
}

/**
Expand All @@ -231,11 +239,13 @@ namespace matx
* axis to perform FFT along
* @param fft_size
* Size of FFT. Setting to 0 uses the output size to figure out the FFT size.
+ * @param norm
+ *   Normalization to apply to FFT
*/
template<typename OpA>
- __MATX_INLINE__ auto ifft(OpA &&a, const int32_t (&axis)[1], uint64_t fft_size = 0) {
+ __MATX_INLINE__ auto ifft(OpA &&a, const int32_t (&axis)[1], uint64_t fft_size = 0, FFTNorm norm = FFTNorm::BACKWARD) {
auto perm = detail::getPermuteDims<remove_cvref_t<OpA>::Rank()>(axis);
- return detail::FFTOp(a, fft_size, perm, detail::ifft_t{});
+ return detail::FFTOp(a, fft_size, perm, detail::ifft_t{}, norm);
}


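A short sketch of the axis overload with the new parameter (the rank-2 batch layout and sizes are illustrative, not from the PR):

cudaStream_t stream = 0;
auto A = matx::make_tensor<cuda::std::complex<float>>({16, 1024});
auto B = matx::make_tensor<cuda::std::complex<float>>({16, 1024});

// Batched FFT along axis 1 with orthonormal scaling
(B = matx::fft(A, {1}, 0, matx::FFTNorm::ORTHO)).run(stream);

// Using ORTHO on the inverse as well makes the round trip recover A
(A = matx::ifft(B, {1}, 0, matx::FFTNorm::ORTHO)).run(stream);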
include/matx/transforms/ambgfun.h (8 additions, 7 deletions)

@@ -43,6 +43,7 @@
#include "matx/core/type_utils.h"

#include "matx/transforms/copy.h"
#include "matx/transforms/fft.h"

namespace matx {
typedef enum {
@@ -208,7 +209,7 @@ void ambgfun_impl(AMFTensor &amf, XTensor &x,
(fullfft = 0).run(stream);
matx::copy(partfft, new_ynorm_v, stream);

- ifft_impl(fullfft, fullfft, 0, stream);
+ ifft_impl(fullfft, fullfft, 0, FFTNorm::BACKWARD, stream);

// We need to temporarily allocate a complex output version of AMF since we
// have no way to convert complex to real in an operator currently
@@ -223,17 +224,17 @@
(fullfft_x = 0).run(stream);
matx::copy(partfft_x, x_normdiv_v, stream);

- fft_impl(fullfft_x, fullfft_x, 0, stream);
+ fft_impl(fullfft_x, fullfft_x, 0, FFTNorm::BACKWARD, stream);
AmbgFftXOp(fullfft_x, fullfft_x, fs, cut_val, (float)nfreq).run(stream);
- ifft_impl(fullfft_x, fullfft_x, 0, stream);
+ ifft_impl(fullfft_x, fullfft_x, 0, FFTNorm::BACKWARD, stream);

auto fullfft_y = make_tensor<T1>({nfreq}, MATX_ASYNC_DEVICE_MEMORY, stream);
(fullfft_y = 0).run(stream);

auto partfft_y = fullfft_y.Slice({0}, {xlen});
matx::copy(partfft_y, y_normdiv_v, stream);
(fullfft_y = fullfft_y * conj(fullfft_x)).run(stream);
- ifft_impl(fullfft_y, fullfft_y, 0, stream);
+ ifft_impl(fullfft_y, fullfft_y, 0, FFTNorm::BACKWARD, stream);

// This allocation should not be necessary, but we're getting compiler
// errors when cloning/slicing
@@ -251,7 +252,7 @@

(fullfft_y = 0).run(stream);
matx::copy(partfft_y, y_normdiv_v, stream);
- fft_impl(fullfft_y, fullfft_y, 0, stream);
+ fft_impl(fullfft_y, fullfft_y, 0, FFTNorm::BACKWARD, stream);

auto fullfft_x = make_tensor<T1>({len_seq - 1}, MATX_ASYNC_DEVICE_MEMORY, stream);
(fullfft_x = 0).run(stream);
@@ -260,13 +261,13 @@
auto partfft_x = make_tensor(fullfft_x.GetStorage(), xnd_size);

AmbgDoppX(partfft_x, x_normdiv_v, fs, cut_val).run(stream);
- fft_impl(fullfft_x, fullfft_x, 0, stream);
+ fft_impl(fullfft_x, fullfft_x, 0, FFTNorm::BACKWARD, stream);

// This allocation should not be necessary, but we're getting compiler
// errors when cloning/slicing
auto amf_tmp_v = make_tensor<T1>({fullfft_x.Size(0)}, MATX_ASYNC_DEVICE_MEMORY, stream);
(fullfft_y = fullfft_y * conj(fullfft_x)).run(stream);
- ifft_impl(fullfft_y, fullfft_y, 0, stream);
+ ifft_impl(fullfft_y, fullfft_y, 0, FFTNorm::BACKWARD, stream);

(amf_tmp_v = abs(fftshift1D(fullfft_y))).run(stream);

include/matx/transforms/fft.h (72 additions, 15 deletions)

@@ -47,6 +47,13 @@
#include <optional>

namespace matx {

+ enum class FFTNorm {
+   BACKWARD, /// fft is unscaled, ifft is scaled by 1/N
+   FORWARD,  /// fft is scaled by 1/N, ifft is unscaled
+   ORTHO     /// both fft and ifft are scaled by 1/sqrt(N)
+ };

namespace detail {

static constexpr int MAX_FFT_RANK = 2;
@@ -96,11 +103,28 @@ template <typename OutTensorType, typename InTensorType> class matxFFTPlan_t {
* CUDA stream
**/
void inline Forward(OutTensorType &o,
- const InTensorType &i, cudaStream_t stream)
+ const InTensorType &i, cudaStream_t stream, FFTNorm norm = FFTNorm::BACKWARD)
{
MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)
cufftSetStream(this->plan_, stream);

+ // Compute the normalization factor from the FFT size; the scaling is
+ // applied to the output after Exec below
+ using s_type = typename detail::value_promote_t<typename InTensorType::scalar_type>;
+ s_type factor;
+ if (params_.fft_rank == 1) {
+   factor = static_cast<s_type>(params_.n[0]);
+ } else {
+   factor = static_cast<s_type>(params_.n[0] * params_.n[1]);
+ }

Exec(o, i, CUFFT_FORWARD);

+ if (norm == FFTNorm::ORTHO) {
+   (o *= 1.0 / std::sqrt(factor)).run(stream);
+ } else if (norm == FFTNorm::FORWARD) {
+   (o *= 1.0 / factor).run(stream);
+ }

}

/**
@@ -118,21 +142,28 @@
* CUDA stream
**/
void inline Inverse(OutTensorType &o,
- const InTensorType &i, cudaStream_t stream)
+ const InTensorType &i, cudaStream_t stream, FFTNorm norm = FFTNorm::BACKWARD)
{
MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)
cufftSetStream(this->plan_, stream);
Exec(o, i, CUFFT_INVERSE);

// cuFFT doesn't scale IFFT the same as MATLAB/Python. Scale it here to
// match
+ using s_type = typename detail::value_promote_t<typename OutTensorType::scalar_type>;
+ s_type factor;
  if (params_.fft_rank == 1) {
-   (o = o * 1.0 / static_cast<double>(params_.n[0])).run(stream);
+   factor = static_cast<s_type>(params_.n[0]);
+ } else {
+   factor = static_cast<s_type>(params_.n[0] * params_.n[1]);
  }
- else {
-   (o = o * 1.0 / static_cast<double>(params_.n[0] * params_.n[1]))
-       .run(stream);
- }

+ if (norm == FFTNorm::ORTHO) {
+   (o *= 1.0 / std::sqrt(factor)).run(stream);
+ } else if (norm == FFTNorm::BACKWARD) {
+   (o *= 1.0 / factor).run(stream);
+ }

}

static FftParams_t GetFFTParams(OutTensorType &o,
@@ -803,9 +834,38 @@ __MATX_INLINE__ auto getCufft2DSupportedTensor( const TensorOp &in, cudaStream_t
} // end namespace detail



+ /**
+ * Run a 1D FFT with a cached plan
+ *
+ * Creates a new FFT plan in the cache if none exists, and uses that to execute
+ * the 1D FFT. Note that FFTs and IFFTs share the same plans if all dimensions
+ * match
+ *
+ * @tparam OutputTensor
+ *   Output tensor or operator type
+ * @tparam InputTensor
+ *   Input tensor or operator type
+ * @param o
+ *   Output tensor or operator. The length of the fastest-changing dimension dictates the
+ *   size of FFT. If this size is longer than the length of the input tensor, the
+ *   tensor will potentially be copied and zero-padded to a new block of memory.
+ *   Future releases may remove this restriction so that there is no copy.
+ *
+ *   Note: fft_size must be unsigned so that the axis overload does not match both
+ *   prototypes with index_t.
+ * @param i
+ *   Input tensor or operator
+ * @param fft_size
+ *   Size of FFT. Setting to 0 uses the output size to figure out the FFT size.
+ * @param norm
+ *   Normalization to apply to FFT
+ * @param stream
+ *   CUDA stream
+ */
template <typename OutputTensor, typename InputTensor>
__MATX_INLINE__ void fft_impl(OutputTensor o, const InputTensor i,
- uint64_t fft_size = 0, cudaStream_t stream = 0)
+ uint64_t fft_size = 0, FFTNorm norm = FFTNorm::BACKWARD, cudaStream_t stream = 0)
{
MATX_STATIC_ASSERT_STR(OutputTensor::Rank() == InputTensor::Rank(), matxInvalidDim,
"Input and output tensor ranks must match");
@@ -834,11 +894,11 @@ __MATX_INLINE__ void fft_impl(OutputTensor o, const InputTensor i,
if (ret == std::nullopt) {
auto tmp = new detail::matxFFTPlan1D_t<decltype(out), decltype(in)>{out, in};
detail::cache_1d.Insert(params, static_cast<void *>(tmp));
- tmp->Forward(out, in, stream);
+ tmp->Forward(out, in, stream, norm);
}
else {
auto fft_type = static_cast<detail::matxFFTPlan1D_t<decltype(out), decltype(in)> *>(ret.value());
- fft_type->Forward(out, in, stream);
+ fft_type->Forward(out, in, stream, norm);
}

if(!out.isSameView(o)) {
Expand All @@ -847,11 +907,9 @@ __MATX_INLINE__ void fft_impl(OutputTensor o, const InputTensor i,
}




template <typename OutputTensor, typename InputTensor>
__MATX_INLINE__ void ifft_impl(OutputTensor o, const InputTensor i,
- uint64_t fft_size = 0, cudaStream_t stream = 0)
+ uint64_t fft_size = 0, FFTNorm norm = FFTNorm::BACKWARD, cudaStream_t stream = 0)
{
MATX_STATIC_ASSERT_STR(OutputTensor::Rank() == InputTensor::Rank(), matxInvalidDim,
"Input and output tensor ranks must match");
@@ -879,11 +937,11 @@ __MATX_INLINE__ void ifft_impl(OutputTensor o, const InputTensor i,
if (ret == std::nullopt) {
auto tmp = new detail::matxFFTPlan1D_t<decltype(out), decltype(in)>{out, in};
detail::cache_1d.Insert(params, static_cast<void *>(tmp));
- tmp->Inverse(out, in, stream);
+ tmp->Inverse(out, in, stream, norm);
}
else {
auto fft_type = static_cast<detail::matxFFTPlan1D_t<decltype(out), decltype(in)> *>(ret.value());
- fft_type->Inverse(out, in, stream);
+ fft_type->Inverse(out, in, stream, norm);
}

if(!out.isSameView(o)) {
Expand All @@ -892,7 +950,6 @@ __MATX_INLINE__ void ifft_impl(OutputTensor o, const InputTensor i,
}



/**
* Run a 2D FFT with a cached plan
*
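For reference, the scaling each mode applies to a length-N transform, summarizing the FFTNorm comments above (cuFFT itself scales neither direction, so all scaling happens in these wrappers):

norm       fft scale    ifft scale
BACKWARD   1            1/N
FORWARD    1/N          1
ORTHO      1/sqrt(N)    1/sqrt(N)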