Adds normalization options for fft and ifft #456

Merged: 6 commits, Jul 27, 2023
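This PR adds an FFTNorm argument to the fft()/ifft() operators and the fft_impl()/ifft_impl() transforms. Below is a minimal usage sketch of the new API, not code from the PR; tensor names and sizes are illustrative, and FFTNorm::BACKWARD remains the default, so existing call sites behave as before.

#include "matx.h"

using namespace matx;

int main() {
  cudaStream_t stream = 0;

  auto x = make_tensor<cuda::std::complex<float>>({1024});
  auto X = make_tensor<cuda::std::complex<float>>({1024});

  // Default (BACKWARD): forward FFT is unscaled, inverse applies 1/N
  (X = fft(x)).run(stream);

  // ORTHO: both directions scale by 1/sqrt(N), so fft followed by ifft
  // reproduces x without any manual rescaling
  (X = fft(x, 0, FFTNorm::ORTHO)).run(stream);
  (x = ifft(X, 0, FFTNorm::ORTHO)).run(stream);

  cudaStreamSynchronize(stream);
  return 0;
}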
examples/fft_conv.cu (3 additions, 2 deletions)

@@ -120,13 +120,14 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
if (i == 1) {
cudaEventRecord(start, stream);
}
- fft_impl(sig_freq, sig_time, 0, stream);
- fft_impl(filt_freq, filt_time, 0, stream);
+ (sig_freq = fft(sig_time)).run(stream);
+ (filt_freq = fft(filt_time)).run(stream);

(sig_freq = sig_freq * filt_freq).run(stream);

// IFFT in-place
(sig_freq = ifft(sig_freq)).run(stream);

}

cudaEventRecord(stop, stream);
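The example above switches from direct fft_impl() calls to the operator form. For reference, a condensed sketch of the frequency-domain convolution pattern it implements (names as in the example; comments are editorial):

// Circular convolution via the frequency domain: ifft(fft(sig) * fft(filt)).
// With the default FFTNorm::BACKWARD, the forward transforms are unscaled and
// the inverse applies 1/N, matching the usual convolution identity.
(sig_freq = fft(sig_time)).run(stream);
(filt_freq = fft(filt_time)).run(stream);
(sig_freq = sig_freq * filt_freq).run(stream);
(sig_freq = ifft(sig_freq)).run(stream); // in-place IFFT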
include/matx/core/type_utils.h (1 addition, 1 deletion)

@@ -669,7 +669,7 @@ using extract_scalar_type_t = typename detail::extract_scalar_type_impl<T>::scal
* @tparam T Type to convert
*/
template <typename T>
- using promote_half_t = typename std::conditional_t<is_half_v<T>, float, T>;
+ using promote_half_t = typename std::conditional_t<is_half_v<T> || is_matx_half_v<T>, float, T>;



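promote_half_t previously promoted only the CUDA half types matched by is_half_v; it now also promotes MatX's own half types via is_matx_half_v. A sketch of the intended behavior, assuming matxFp16 is one of the types is_matx_half_v matches:

static_assert(std::is_same_v<matx::promote_half_t<__half>, float>);         // already promoted
static_assert(std::is_same_v<matx::promote_half_t<matx::matxFp16>, float>); // newly promoted
static_assert(std::is_same_v<matx::promote_half_t<float>, float>);          // non-half passes through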
include/matx/operators/dct.h (2 additions, 1 deletion)

@@ -39,6 +39,7 @@
#include "matx/core/error.h"
#include "matx/core/tensor.h"
#include "matx/core/type_utils.h"
#include "matx/transforms/fft.h"

namespace matx {

@@ -102,7 +103,7 @@ void dct(OutputTensor &out, const InputTensor &in,

tensor_t<cuda::std::complex<typename OutputTensor::scalar_type>, 1> tmp{{N + 1}};

- fft_impl(tmp, in, 0, stream);
+ fft_impl(tmp, in, 0, FFTNorm::BACKWARD, stream);
auto s = tmp.Slice({0}, {N});
detail::dctOp(out, s, N).run(stream);
}
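Internal call sites such as this one in dct() pass FFTNorm::BACKWARD explicitly, so transforms built on fft_impl keep their pre-PR scaling behavior.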
include/matx/operators/fft.h (23 additions, 13 deletions)

@@ -48,6 +48,7 @@ namespace matx
uint64_t fft_size_;
PermDims perm_;
FFTType type_;
+ FFTNorm norm_;
std::array<index_t, OpA::Rank()> out_dims_;
matx::tensor_t<std::conditional_t<is_complex_v<typename OpA::scalar_type>,
typename OpA::scalar_type,
@@ -68,7 +69,8 @@
}
}

- __MATX_INLINE__ FFTOp(OpA a, uint64_t size, PermDims perm, FFTType t) : a_(a), fft_size_(size), perm_(perm), type_(t) {
+ __MATX_INLINE__ FFTOp(OpA a, uint64_t size, PermDims perm, FFTType t, FFTNorm norm) :
+     a_(a), fft_size_(size), perm_(perm), type_(t), norm_(norm) {
for (int r = 0; r < Rank(); r++) {
out_dims_[r] = a_.Size(r);
}
@@ -113,18 +115,18 @@
static_assert(is_device_executor_v<Executor>, "fft()/ifft() only supports the CUDA executor currently");
if constexpr (std::is_same_v<PermDims, no_permute_t>) {
if constexpr (std::is_same_v<FFTType, fft_t>) {
- fft_impl(std::get<0>(out), a_, fft_size_, ex.getStream());
+ fft_impl(std::get<0>(out), a_, fft_size_, norm_, ex.getStream());
}
else {
- ifft_impl(std::get<0>(out), a_, fft_size_, ex.getStream());
+ ifft_impl(std::get<0>(out), a_, fft_size_, norm_, ex.getStream());
}
}
else {
if constexpr (std::is_same_v<FFTType, fft_t>) {
- fft_impl(permute(std::get<0>(out), perm_), permute(a_, perm_), fft_size_, ex.getStream());
+ fft_impl(permute(std::get<0>(out), perm_), permute(a_, perm_), fft_size_, norm_, ex.getStream());
}
else {
- ifft_impl(permute(std::get<0>(out), perm_), permute(a_, perm_), fft_size_, ex.getStream());
+ ifft_impl(permute(std::get<0>(out), perm_), permute(a_, perm_), fft_size_, norm_, ex.getStream());
}
}
}
@@ -161,10 +163,12 @@
* input tensor or operator
* @param fft_size
* Size of FFT. Setting to 0 uses the output size to figure out the FFT size.
+ * @param norm
+ *   Normalization to apply to FFT
*/
template<typename OpA>
- __MATX_INLINE__ auto fft(OpA &&a, uint64_t fft_size = 0) {
-   return detail::FFTOp(a, fft_size, detail::no_permute_t{}, detail::fft_t{});
+ __MATX_INLINE__ auto fft(OpA &&a, uint64_t fft_size = 0, FFTNorm norm = FFTNorm::BACKWARD) {
+   return detail::FFTOp(a, fft_size, detail::no_permute_t{}, detail::fft_t{}, norm);
}

/**
@@ -184,12 +188,14 @@
* axis to perform FFT along
* @param fft_size
* Size of FFT. Setting to 0 uses the output size to figure out the FFT size.
+ * @param norm
+ *   Normalization to apply to FFT
*/
template<typename OpA>
- __MATX_INLINE__ auto fft(OpA &&a, const int32_t (&axis)[1], uint64_t fft_size = 0) {
+ __MATX_INLINE__ auto fft(OpA &&a, const int32_t (&axis)[1], uint64_t fft_size = 0, FFTNorm norm = FFTNorm::BACKWARD) {

auto perm = detail::getPermuteDims<remove_cvref_t<OpA>::Rank()>(axis);
- return detail::FFTOp(a, fft_size, perm, detail::fft_t{});
+ return detail::FFTOp(a, fft_size, perm, detail::fft_t{}, norm);
}

/**
Expand All @@ -208,10 +214,12 @@ namespace matx
* input tensor or operator
* @param fft_size
* Size of FFT. Setting to 0 uses the output size to figure out the FFT size.
+ * @param norm
+ *   Normalization to apply to FFT
*/
template<typename OpA>
- __MATX_INLINE__ auto ifft(OpA &&a, uint64_t fft_size = 0) {
-   return detail::FFTOp(a, fft_size, detail::no_permute_t{}, detail::ifft_t{});
+ __MATX_INLINE__ auto ifft(OpA &&a, uint64_t fft_size = 0, FFTNorm norm = FFTNorm::BACKWARD) {
+   return detail::FFTOp(a, fft_size, detail::no_permute_t{}, detail::ifft_t{}, norm);
}

/**
Expand All @@ -231,11 +239,13 @@ namespace matx
* axis to perform FFT along
* @param fft_size
* Size of FFT. Setting to 0 uses the output size to figure out the FFT size.
+ * @param norm
+ *   Normalization to apply to FFT
*/
template<typename OpA>
- __MATX_INLINE__ auto ifft(OpA &&a, const int32_t (&axis)[1], uint64_t fft_size = 0) {
+ __MATX_INLINE__ auto ifft(OpA &&a, const int32_t (&axis)[1], uint64_t fft_size = 0, FFTNorm norm = FFTNorm::BACKWARD) {
auto perm = detail::getPermuteDims<remove_cvref_t<OpA>::Rank()>(axis);
- return detail::FFTOp(a, fft_size, perm, detail::ifft_t{});
+ return detail::FFTOp(a, fft_size, perm, detail::ifft_t{}, norm);
}


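A short sketch of the axis overload with the new parameter (the rank-2 batch layout and sizes are illustrative, not from the PR):

cudaStream_t stream = 0;
auto A = matx::make_tensor<cuda::std::complex<float>>({16, 1024});
auto B = matx::make_tensor<cuda::std::complex<float>>({16, 1024});

// Batched FFT along axis 1 with orthonormal scaling
(B = matx::fft(A, {1}, 0, matx::FFTNorm::ORTHO)).run(stream);

// Using ORTHO on the inverse as well makes the round trip recover A
(A = matx::ifft(B, {1}, 0, matx::FFTNorm::ORTHO)).run(stream);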
include/matx/transforms/ambgfun.h (8 additions, 7 deletions)

@@ -43,6 +43,7 @@
#include "matx/core/type_utils.h"

#include "matx/transforms/copy.h"
#include "matx/transforms/fft.h"

namespace matx {
typedef enum {
@@ -208,7 +209,7 @@ void ambgfun_impl(AMFTensor &amf, XTensor &x,
(fullfft = 0).run(stream);
matx::copy(partfft, new_ynorm_v, stream);

- ifft_impl(fullfft, fullfft, 0, stream);
+ ifft_impl(fullfft, fullfft, 0, FFTNorm::BACKWARD, stream);

// We need to temporarily allocate a complex output version of AMF since we
// have no way to convert complex to real in an operator currently
@@ -223,17 +224,17 @@
(fullfft_x = 0).run(stream);
matx::copy(partfft_x, x_normdiv_v, stream);

- fft_impl(fullfft_x, fullfft_x, 0, stream);
+ fft_impl(fullfft_x, fullfft_x, 0, FFTNorm::BACKWARD, stream);
AmbgFftXOp(fullfft_x, fullfft_x, fs, cut_val, (float)nfreq).run(stream);
- ifft_impl(fullfft_x, fullfft_x, 0, stream);
+ ifft_impl(fullfft_x, fullfft_x, 0, FFTNorm::BACKWARD, stream);

auto fullfft_y = make_tensor<T1>({nfreq}, MATX_ASYNC_DEVICE_MEMORY, stream);
(fullfft_y = 0).run(stream);

auto partfft_y = fullfft_y.Slice({0}, {xlen});
matx::copy(partfft_y, y_normdiv_v, stream);
(fullfft_y = fullfft_y * conj(fullfft_x)).run(stream);
- ifft_impl(fullfft_y, fullfft_y, 0, stream);
+ ifft_impl(fullfft_y, fullfft_y, 0, FFTNorm::BACKWARD, stream);

// This allocation should not be necessary, but we're getting compiler
// errors when cloning/slicing
@@ -251,7 +252,7 @@

(fullfft_y = 0).run(stream);
matx::copy(partfft_y, y_normdiv_v, stream);
- fft_impl(fullfft_y, fullfft_y, 0, stream);
+ fft_impl(fullfft_y, fullfft_y, 0, FFTNorm::BACKWARD, stream);

auto fullfft_x = make_tensor<T1>({len_seq - 1}, MATX_ASYNC_DEVICE_MEMORY, stream);
(fullfft_x = 0).run(stream);
@@ -260,13 +261,13 @@
auto partfft_x = make_tensor(fullfft_x.GetStorage(), xnd_size);

AmbgDoppX(partfft_x, x_normdiv_v, fs, cut_val).run(stream);
- fft_impl(fullfft_x, fullfft_x, 0, stream);
+ fft_impl(fullfft_x, fullfft_x, 0, FFTNorm::BACKWARD, stream);

// This allocation should not be necessary, but we're getting compiler
// errors when cloning/slicing
auto amf_tmp_v = make_tensor<T1>({fullfft_x.Size(0)}, MATX_ASYNC_DEVICE_MEMORY, stream);
(fullfft_y = fullfft_y * conj(fullfft_x)).run(stream);
- ifft_impl(fullfft_y, fullfft_y, 0, stream);
+ ifft_impl(fullfft_y, fullfft_y, 0, FFTNorm::BACKWARD, stream);

(amf_tmp_v = abs(fftshift1D(fullfft_y))).run(stream);

include/matx/transforms/fft.h (72 additions, 15 deletions)

@@ -47,6 +47,13 @@
#include <optional>

namespace matx {

+ enum class FFTNorm {
+   BACKWARD, /// fft is unscaled, ifft is scaled by 1/N
+   FORWARD,  /// fft is scaled by 1/N, ifft is unscaled
+   ORTHO     /// both fft and ifft are scaled by 1/sqrt(N)
+ };

namespace detail {

static constexpr int MAX_FFT_RANK = 2;
@@ -96,11 +103,28 @@ template <typename OutTensorType, typename InTensorType> class matxFFTPlan_t {
* CUDA stream
**/
void inline Forward(OutTensorType &o,
- const InTensorType &i, cudaStream_t stream)
+ const InTensorType &i, cudaStream_t stream, FFTNorm norm = FFTNorm::BACKWARD)
{
MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)
cufftSetStream(this->plan_, stream);

+ // Compute the normalization factor from the FFT size; the scaling is
+ // applied to the output after Exec below
+ using s_type = typename detail::value_promote_t<typename InTensorType::scalar_type>;
+ s_type factor;
+ if (params_.fft_rank == 1) {
+   factor = static_cast<s_type>(params_.n[0]);
+ } else {
+   factor = static_cast<s_type>(params_.n[0] * params_.n[1]);
+ }

Exec(o, i, CUFFT_FORWARD);

+ if (norm == FFTNorm::ORTHO) {
+   (o *= 1.0 / std::sqrt(factor)).run(stream);
+ } else if (norm == FFTNorm::FORWARD) {
+   (o *= 1.0 / factor).run(stream);
+ }

}

/**
@@ -118,21 +142,28 @@
* CUDA stream
**/
void inline Inverse(OutTensorType &o,
- const InTensorType &i, cudaStream_t stream)
+ const InTensorType &i, cudaStream_t stream, FFTNorm norm = FFTNorm::BACKWARD)
{
MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)
cufftSetStream(this->plan_, stream);
Exec(o, i, CUFFT_INVERSE);

// cuFFT doesn't scale IFFT the same as MATLAB/Python. Scale it here to
// match
+ using s_type = typename detail::value_promote_t<typename OutTensorType::scalar_type>;
+ s_type factor;
  if (params_.fft_rank == 1) {
-   (o = o * 1.0 / static_cast<double>(params_.n[0])).run(stream);
+   factor = static_cast<s_type>(params_.n[0]);
+ } else {
+   factor = static_cast<s_type>(params_.n[0] * params_.n[1]);
  }
- else {
-   (o = o * 1.0 / static_cast<double>(params_.n[0] * params_.n[1]))
-       .run(stream);
- }

+ if (norm == FFTNorm::ORTHO) {
+   (o *= 1.0 / std::sqrt(factor)).run(stream);
+ } else if (norm == FFTNorm::BACKWARD) {
+   (o *= 1.0 / factor).run(stream);
+ }

}

static FftParams_t GetFFTParams(OutTensorType &o,
@@ -803,9 +834,38 @@ __MATX_INLINE__ auto getCufft2DSupportedTensor( const TensorOp &in, cudaStream_t
} // end namespace detail



+ /**
+ * Run a 1D FFT with a cached plan
+ *
+ * Creates a new FFT plan in the cache if none exists, and uses that to execute
+ * the 1D FFT. Note that FFTs and IFFTs share the same plans if all dimensions
+ * match
+ *
+ * @tparam OutputTensor
+ *   Output tensor or operator type
+ * @tparam InputTensor
+ *   Input tensor or operator type
+ * @param o
+ *   Output tensor or operator. The length of the fastest-changing dimension dictates the
+ *   size of FFT. If this size is longer than the length of the input tensor, the
+ *   tensor will potentially be copied and zero-padded to a new block of memory.
+ *   Future releases may remove this restriction so that there is no copy.
+ *
+ *   Note: fft_size must be unsigned so that the axis overload does not match both
+ *   prototypes with index_t.
+ * @param i
+ *   Input tensor or operator
+ * @param fft_size
+ *   Size of FFT. Setting to 0 uses the output size to figure out the FFT size.
+ * @param norm
+ *   Normalization to apply to FFT
+ * @param stream
+ *   CUDA stream
+ */
template <typename OutputTensor, typename InputTensor>
__MATX_INLINE__ void fft_impl(OutputTensor o, const InputTensor i,
- uint64_t fft_size = 0, cudaStream_t stream = 0)
+ uint64_t fft_size = 0, FFTNorm norm = FFTNorm::BACKWARD, cudaStream_t stream = 0)
{
MATX_STATIC_ASSERT_STR(OutputTensor::Rank() == InputTensor::Rank(), matxInvalidDim,
"Input and output tensor ranks must match");
@@ -834,11 +894,11 @@ __MATX_INLINE__ void fft_impl(OutputTensor o, const InputTensor i,
if (ret == std::nullopt) {
auto tmp = new detail::matxFFTPlan1D_t<decltype(out), decltype(in)>{out, in};
detail::cache_1d.Insert(params, static_cast<void *>(tmp));
- tmp->Forward(out, in, stream);
+ tmp->Forward(out, in, stream, norm);
}
else {
auto fft_type = static_cast<detail::matxFFTPlan1D_t<decltype(out), decltype(in)> *>(ret.value());
- fft_type->Forward(out, in, stream);
+ fft_type->Forward(out, in, stream, norm);
}

if(!out.isSameView(o)) {
Expand All @@ -847,11 +907,9 @@ __MATX_INLINE__ void fft_impl(OutputTensor o, const InputTensor i,
}




template <typename OutputTensor, typename InputTensor>
__MATX_INLINE__ void ifft_impl(OutputTensor o, const InputTensor i,
- uint64_t fft_size = 0, cudaStream_t stream = 0)
+ uint64_t fft_size = 0, FFTNorm norm = FFTNorm::BACKWARD, cudaStream_t stream = 0)
{
MATX_STATIC_ASSERT_STR(OutputTensor::Rank() == InputTensor::Rank(), matxInvalidDim,
"Input and output tensor ranks must match");
@@ -879,11 +937,11 @@ __MATX_INLINE__ void ifft_impl(OutputTensor o, const InputTensor i,
if (ret == std::nullopt) {
auto tmp = new detail::matxFFTPlan1D_t<decltype(out), decltype(in)>{out, in};
detail::cache_1d.Insert(params, static_cast<void *>(tmp));
- tmp->Inverse(out, in, stream);
+ tmp->Inverse(out, in, stream, norm);
}
else {
auto fft_type = static_cast<detail::matxFFTPlan1D_t<decltype(out), decltype(in)> *>(ret.value());
- fft_type->Inverse(out, in, stream);
+ fft_type->Inverse(out, in, stream, norm);
}

if(!out.isSameView(o)) {
Expand All @@ -892,7 +950,6 @@ __MATX_INLINE__ void ifft_impl(OutputTensor o, const InputTensor i,
}



/**
* Run a 2D FFT with a cached plan
*
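For reference, the scaling each mode applies to a length-N transform, summarizing the FFTNorm comments above (cuFFT itself scales neither direction, so all scaling happens in these wrappers):

norm       fft scale    ifft scale
BACKWARD   1            1/N
FORWARD    1/N          1
ORTHO      1/sqrt(N)    1/sqrt(N)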